options(warn = -1)
options(scipen = 999) # turns off scientific notation
library(tidyverse)
library(tidymodels)
library(dplyr)
library(janitor)
library(solitude) # Isolation Forest
library(ggpubr) # Isolation Forest
library(skimr)
library(modelr)
library(GGally)
library(kableExtra) # make nice looking results when we knit
library(fastshap) # shapley values for variable importance
library(MASS)
library(tree)
library(ggplot2)
library(corrplot)
library(factoextra)
library(rpart.plot) # plotting decision trees
library(lubridate)
library(vip)
library(NeuralNetTools) # visualization of neural networks
library(reshape2)
library(PerformanceAnalytics)
library(DALEX) # new
library(DALEXtra) # new
library(caret)loan <- read_csv("loan_train.csv") %>% clean_names() %>%
mutate(issue_d_year = year(Sys.Date()) - year(my(issue_d))) %>%
mutate(earliest_cr_line_year = year(Sys.Date()) - year(my(earliest_cr_line))) %>%
mutate(last_pymnt_d_year = year(Sys.Date()) - year(my(last_pymnt_d))) %>%
mutate(last_credit_pull_d_year = year(Sys.Date()) - year(my(last_credit_pull_d))) %>%
dplyr::select(-issue_d, -earliest_cr_line, -last_pymnt_d, -last_credit_pull_d)
head(loan)nrow(loan)## [1] 29777
skim(loan)| Name | loan |
| Number of rows | 29777 |
| Number of columns | 52 |
| _______________________ | |
| Column type frequency: | |
| character | 19 |
| numeric | 33 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| term | 3 | 1.00 | 9 | 9 | 0 | 2 | 0 |
| int_rate | 3 | 1.00 | 5 | 6 | 0 | 390 | 0 |
| grade | 3 | 1.00 | 1 | 1 | 0 | 7 | 0 |
| sub_grade | 3 | 1.00 | 2 | 2 | 0 | 35 | 0 |
| emp_title | 1817 | 0.94 | 2 | 75 | 0 | 22143 | 0 |
| emp_length | 3 | 1.00 | 3 | 9 | 0 | 12 | 0 |
| home_ownership | 3 | 1.00 | 3 | 8 | 0 | 5 | 0 |
| verification_status | 3 | 1.00 | 8 | 15 | 0 | 3 | 0 |
| loan_status | 0 | 1.00 | 7 | 7 | 0 | 2 | 0 |
| pymnt_plan | 3 | 1.00 | 1 | 1 | 0 | 2 | 0 |
| url | 3 | 1.00 | 62 | 64 | 0 | 29774 | 0 |
| desc | 9432 | 0.68 | 1 | 3943 | 0 | 20310 | 0 |
| purpose | 3 | 1.00 | 3 | 18 | 0 | 14 | 0 |
| title | 13 | 1.00 | 2 | 80 | 0 | 15200 | 0 |
| zip_code | 3 | 1.00 | 5 | 5 | 0 | 819 | 0 |
| addr_state | 3 | 1.00 | 2 | 2 | 0 | 50 | 0 |
| revol_util | 67 | 1.00 | 2 | 6 | 0 | 1094 | 0 |
| next_pymnt_d | 27425 | 0.08 | 8 | 8 | 0 | 96 | 0 |
| application_type | 3 | 1.00 | 10 | 10 | 0 | 1 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| id | 3 | 1.00 | 663006.18 | 219881.52 | 54734.00 | 496344.25 | 642763.00 | 825647.50 | 1077501.00 | ▁▃▇▆▅ |
| member_id | 3 | 1.00 | 823568.15 | 280420.33 | 70473.00 | 635380.00 | 822247.50 | 1033656.25 | 1314167.00 | ▁▃▇▇▆ |
| loan_amnt | 3 | 1.00 | 11109.43 | 7404.65 | 500.00 | 5225.00 | 9800.00 | 15000.00 | 35000.00 | ▇▇▃▂▁ |
| funded_amnt | 3 | 1.00 | 10843.64 | 7147.05 | 500.00 | 5100.00 | 9600.00 | 15000.00 | 35000.00 | ▇▇▃▂▁ |
| funded_amnt_inv | 3 | 1.00 | 10149.66 | 7130.86 | 0.00 | 4950.00 | 8500.00 | 14000.00 | 35000.00 | ▇▆▃▁▁ |
| installment | 3 | 1.00 | 323.81 | 209.77 | 15.67 | 165.84 | 278.94 | 429.86 | 1305.19 | ▇▆▂▁▁ |
| annual_inc | 4 | 1.00 | 69201.23 | 66566.42 | 2000.00 | 40000.00 | 59000.00 | 82500.00 | 6000000.00 | ▇▁▁▁▁ |
| dti | 3 | 1.00 | 13.38 | 6.74 | 0.00 | 8.19 | 13.49 | 18.70 | 29.99 | ▅▇▇▆▂ |
| delinq_2yrs | 23 | 1.00 | 0.16 | 0.52 | 0.00 | 0.00 | 0.00 | 0.00 | 13.00 | ▇▁▁▁▁ |
| fico_range_low | 3 | 1.00 | 713.05 | 36.31 | 610.00 | 685.00 | 710.00 | 740.00 | 825.00 | ▁▇▇▅▁ |
| fico_range_high | 3 | 1.00 | 717.05 | 36.31 | 614.00 | 689.00 | 714.00 | 744.00 | 829.00 | ▁▇▇▅▁ |
| inq_last_6mths | 23 | 1.00 | 1.08 | 1.54 | 0.00 | 0.00 | 1.00 | 2.00 | 33.00 | ▇▁▁▁▁ |
| mths_since_last_delinq | 18907 | 0.37 | 34.72 | 22.32 | 0.00 | 17.00 | 32.00 | 51.00 | 120.00 | ▇▇▅▂▁ |
| mths_since_last_record | 27208 | 0.09 | 59.23 | 47.22 | 0.00 | 0.00 | 85.00 | 102.00 | 129.00 | ▇▁▂▆▅ |
| open_acc | 23 | 1.00 | 9.34 | 4.51 | 1.00 | 6.00 | 9.00 | 12.00 | 47.00 | ▇▃▁▁▁ |
| pub_rec | 23 | 1.00 | 0.06 | 0.25 | 0.00 | 0.00 | 0.00 | 0.00 | 5.00 | ▇▁▁▁▁ |
| revol_bal | 3 | 1.00 | 14310.00 | 22696.55 | 0.00 | 3644.00 | 8783.50 | 17187.00 | 1207359.00 | ▇▁▁▁▁ |
| total_acc | 23 | 1.00 | 22.08 | 11.59 | 1.00 | 13.00 | 20.00 | 29.00 | 81.00 | ▇▇▂▁▁ |
| out_prncp | 3 | 1.00 | 11.80 | 123.53 | 0.00 | 0.00 | 0.00 | 0.00 | 3126.61 | ▇▁▁▁▁ |
| out_prncp_inv | 3 | 1.00 | 11.76 | 123.23 | 0.00 | 0.00 | 0.00 | 0.00 | 3123.44 | ▇▁▁▁▁ |
| total_rec_late_fee | 3 | 1.00 | 1.50 | 7.72 | 0.00 | 0.00 | 0.00 | 0.00 | 180.20 | ▇▁▁▁▁ |
| last_pymnt_amnt | 3 | 1.00 | 2615.41 | 4373.70 | 0.00 | 211.02 | 531.98 | 3171.29 | 36115.20 | ▇▁▁▁▁ |
| collections_12_mths_ex_med | 104 | 1.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | ▁▁▇▁▁ |
| policy_code | 3 | 1.00 | 1.00 | 0.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | ▁▁▇▁▁ |
| acc_now_delinq | 23 | 1.00 | 0.00 | 0.01 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| chargeoff_within_12_mths | 104 | 1.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | ▁▁▇▁▁ |
| delinq_amnt | 23 | 1.00 | 0.20 | 35.09 | 0.00 | 0.00 | 0.00 | 0.00 | 6053.00 | ▇▁▁▁▁ |
| pub_rec_bankruptcies | 966 | 0.97 | 0.05 | 0.21 | 0.00 | 0.00 | 0.00 | 0.00 | 2.00 | ▇▁▁▁▁ |
| tax_liens | 79 | 1.00 | 0.00 | 0.01 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| issue_d_year | 3 | 1.00 | 11.78 | 0.97 | 11.00 | 11.00 | 11.00 | 12.00 | 15.00 | ▇▅▂▁▁ |
| earliest_cr_line_year | 23 | 1.00 | 25.43 | 6.85 | 14.00 | 21.00 | 24.00 | 29.00 | 76.00 | ▇▃▁▁▁ |
| last_pymnt_d_year | 67 | 1.00 | 9.30 | 1.61 | 6.00 | 8.00 | 9.00 | 10.00 | 15.00 | ▂▇▆▂▁ |
| last_credit_pull_d_year | 5 | 1.00 | 7.43 | 1.81 | 6.00 | 6.00 | 6.00 | 9.00 | 15.00 | ▇▃▂▁▁ |
# drop columns: desc, next_pymnt_d, mths_since_last_delinq, mths_since_last_record (>20% missing values)
loan_kaggle <- read_csv("loan_holdout.csv") %>% clean_names() %>%
mutate(issue_d_year = year(Sys.Date()) - year(my(issue_d))) %>%
mutate(earliest_cr_line_year = year(Sys.Date()) - year(my(earliest_cr_line))) %>%
mutate(last_pymnt_d_year = year(Sys.Date()) - year(my(last_pymnt_d))) %>%
mutate(last_credit_pull_d_year = year(Sys.Date()) - year(my(last_credit_pull_d))) %>%
dplyr::select(-issue_d, -earliest_cr_line, -last_pymnt_d, -last_credit_pull_d)
head(loan_kaggle)loan_summary <- loan %>%
count(loan_status) %>%
mutate(pct = n/sum(n))
loan_summaryloan_summary %>%
ggplot(aes(x=factor(loan_status),y=pct)) +
geom_col() +
geom_text(aes(x=factor(loan_status), y=pct+0.034, label=round(pct,2)), vjust=2.75, colour="white") +
labs(title="Loan Status", x="Loan Status", y="PCT")numeric variables: loan_amnt, funded_amnt, funded_amnt_inv, int_rate, installment, annual_inc, dti, delinq_2yrs, fico_range_low, fico_range_high, inq_last_6mths, open_acc, revol_bal, revol_util, total_acc, out_prncp, out_prncp_inv, total_rec_late_fee, total_rec_late_fee, last_pymnt_amnt, issue_d_year, earliest_cr_line_year, last_pymnt_d_year, last_credit_pull_d_year
# convert percentage to decimal
loan_decimal <- loan
loan_decimal$int_rate <- as.numeric(lapply(loan$int_rate, function(x) as.numeric(sub("%", "", x))/100))
loan_decimal$revol_util <- as.numeric(lapply(loan$revol_util, function(x) as.numeric(sub("%", "", x))/100))
loan_kaggle_decimal <- loan_kaggle
loan_kaggle_decimal$int_rate <- as.numeric(lapply(loan_kaggle$int_rate, function(x) as.numeric(sub("%", "", x))/100))
loan_kaggle_decimal$revol_util <- as.numeric(lapply(loan_kaggle$revol_util, function(x) as.numeric(sub("%", "", x))/100))
# remove outliers
loan_without_outlier <- loan_decimal %>% filter(annual_inc < 500000 & revol_bal < 300000)
# comparative boxplots
boxplot <- function(m){
ggplot(loan_without_outlier, aes(x=!!as.name(m), y=as.factor(loan_status), fill=as.factor(loan_status))) +
geom_boxplot() +
labs(title = as.character(m), y = 'Loan Status') +
theme(legend.title = element_blank())
}
numerics <- c('loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'int_rate', 'installment', 'annual_inc', 'dti', 'delinq_2yrs', 'fico_range_low', 'fico_range_high', 'inq_last_6mths', 'open_acc', 'revol_bal', 'revol_util', 'total_acc', 'out_prncp', 'out_prncp_inv', 'total_rec_late_fee', 'last_pymnt_amnt', 'issue_d_year', 'earliest_cr_line_year', 'last_pymnt_d_year', 'last_credit_pull_d_year')
for (c in numerics){
print(boxplot(c))
}loan_without_outlier %>% ggplot(aes(x=loan_amnt)) + geom_histogram(binwidth=500) + theme(axis.text.x=element_text(angle=45, hjust=1))loan_without_outlier %>% ggplot(aes(x=funded_amnt)) + geom_histogram(binwidth=500) + theme(axis.text.x=element_text(angle=45, hjust=1))loan_without_outlier %>% ggplot(aes(x=funded_amnt_inv)) + geom_histogram(binwidth=500) + theme(axis.text.x=element_text(angle=45, hjust=1))loan_without_outlier %>% ggplot(aes(x=int_rate)) + geom_histogram(binwidth=0.01) + theme(axis.text.x=element_text(angle=45, hjust=1))loan_without_outlier %>% ggplot(aes(x=installment)) + geom_histogram(binwidth=20) + theme(axis.text.x=element_text(angle=45, hjust=1))loan_without_outlier %>% ggplot(aes(x=annual_inc)) + geom_histogram(binwidth=5000) + theme(axis.text.x=element_text(angle=45, hjust=1))loan_without_outlier %>% ggplot(aes(x=dti)) + geom_histogram(binwidth=1) + theme(axis.text.x=element_text(angle=45, hjust=1))loan_without_outlier %>% ggplot(aes(x=delinq_2yrs)) + geom_bar() + theme(axis.text.x=element_text(angle=45, hjust=1))loan_without_outlier %>% ggplot(aes(x=fico_range_low)) + geom_bar() + theme(axis.text.x=element_text(angle=45, hjust=1))loan_without_outlier %>% ggplot(aes(x=fico_range_high)) + geom_bar() + theme(axis.text.x=element_text(angle=45, hjust=1))loan_without_outlier %>% ggplot(aes(x=inq_last_6mths)) + geom_bar() + theme(axis.text.x=element_text(angle=45, hjust=1))loan_without_outlier %>% ggplot(aes(x=open_acc)) + geom_bar() + theme(axis.text.x=element_text(angle=45, hjust=1))loan_without_outlier %>% ggplot(aes(x=revol_bal)) + geom_histogram(binwidth=5000) + theme(axis.text.x=element_text(angle=45, hjust=1))loan_without_outlier %>% ggplot(aes(x=revol_util)) + geom_histogram(binwidth=0.05) + theme(axis.text.x=element_text(angle=45, hjust=1))loan_without_outlier %>% ggplot(aes(x=total_acc)) + geom_histogram(binwidth=2) + theme(axis.text.x=element_text(angle=45, hjust=1))loan_without_outlier %>% ggplot(aes(x=out_prncp)) + geom_histogram(binwidth=100) + theme(axis.text.x=element_text(angle=45, hjust=1))loan_without_outlier %>% ggplot(aes(x=out_prncp_inv)) + geom_histogram(binwidth=100) + theme(axis.text.x=element_text(angle=45, hjust=1))# remove variable: out_prncp, out_prncp_inv (only zero values)
loan_without_outlier %>% ggplot(aes(x=last_pymnt_amnt)) + geom_histogram(binwidth=500) + theme(axis.text.x=element_text(angle=45, hjust=1))loan_without_outlier %>% ggplot(aes(x=last_pymnt_d_year)) + geom_bar() + theme(axis.text.x=element_text(angle=45, hjust=1))loan_without_outlier %>% ggplot(aes(x=issue_d_year)) + geom_bar() + theme(axis.text.x=element_text(angle=45, hjust=1))loan_without_outlier %>% ggplot(aes(x=earliest_cr_line_year)) + geom_bar() + theme(axis.text.x=element_text(angle=45, hjust=1))loan_without_outlier %>% ggplot(aes(x=last_pymnt_d_year)) + geom_bar() + theme(axis.text.x=element_text(angle=45, hjust=1))loan_without_outlier %>% ggplot(aes(x=last_credit_pull_d_year)) + geom_bar() + theme(axis.text.x=element_text(angle=45, hjust=1))categorical variables: term, grade, sub_grade, emp_length, home_ownership, verification_status, pymnt_plan, purpose, addr_state, pub_rec, collections_12_mths_ex_med, policy_code, application_type, acc_now_delinq, chargeoff_within_12_mths, delinq_amnt, pub_rec_bankruptcies, tax_liens
loan_decimal$pub_rec <- as.character(loan_decimal$pub_rec)
loan_decimal$collections_12_mths_ex_med <- as.character(loan_decimal$collections_12_mths_ex_med)
loan_decimal$policy_code <- as.character(loan_decimal$policy_code)
loan_decimal$application_type <- as.character(loan_decimal$application_type)
loan_decimal$acc_now_delinq <- as.character(loan_decimal$acc_now_delinq)
loan_decimal$chargeoff_within_12_mths <- as.character(loan_decimal$chargeoff_within_12_mths)
loan_decimal$delinq_amnt <- as.character(loan_decimal$delinq_amnt)
loan_decimal$tax_liens <- as.character(loan_decimal$tax_liens)
char_fill <- function(col){
loan_decimal %>%
na.omit() %>%
ggplot(aes(!!as.name(col), fill = as.factor(loan_status))) +
geom_bar(position = 'fill') +
coord_flip() +
labs(y = 'proportion') +
theme(legend.title = element_blank())
}
dummy <- c('term', 'grade', 'sub_grade', 'emp_length', 'home_ownership', 'verification_status', 'pymnt_plan', 'purpose', 'addr_state', 'pub_rec', 'collections_12_mths_ex_med', 'policy_code', 'application_type', 'acc_now_delinq', 'chargeoff_within_12_mths', 'delinq_amnt', 'pub_rec_bankruptcies', 'tax_liens')
# -- for each character column, create a chart
for (column in dummy){
print(char_fill(column))
}# drop variables with only one category / imbalanced variables: pymnt_plan, collections_12_mths_ex_med, policy_code, application_type, acc_now_delinq, chargeoff_within_12_mths, delinq_amnt, tax_liens
bar <- function(col){
loan_decimal %>%
na.omit() %>%
# ggplot(aes(!!as.name(col))) +
# geom_bar() +
ggplot(aes(!!as.name(col)), fill = as.factor(loan_status)) +
geom_bar(position = 'identity') +
theme(axis.text.x=element_text(angle=45, hjust=1))
}
dummy <- c('term', 'grade', 'sub_grade', 'emp_length', 'home_ownership', 'verification_status', 'pymnt_plan', 'purpose', 'addr_state', 'pub_rec', 'collections_12_mths_ex_med', 'policy_code', 'application_type', 'acc_now_delinq', 'chargeoff_within_12_mths', 'delinq_amnt', 'pub_rec_bankruptcies', 'tax_liens')
for (column in dummy){
print(bar(column))
}create a correlation matrix of key numeric varaibles: loan_amnt, funded_amnt, funded_amnt_inv, int_rate, installment, annual_inc, dti, fico_range_low, fico_range_high, open_acc, revol_bal, revol_util, total_acc, last_pymnt_amnt
hint: you need to deal with missing values
cor_analysis <- loan_decimal %>%
na.omit() %>%
dplyr::select(loan_amnt, funded_amnt, funded_amnt_inv, int_rate, installment, annual_inc, dti, fico_range_low, fico_range_high, open_acc, revol_bal, revol_util, total_acc, last_pymnt_amnt) %>%
cor() %>%
melt() %>% #turn it into a dataframe
arrange(desc(value))
cor_analysis_1 <- loan_decimal %>%
na.omit() %>%
dplyr::select(loan_amnt, funded_amnt, funded_amnt_inv, int_rate, installment, annual_inc, dti, fico_range_low, fico_range_high, open_acc, revol_bal, revol_util, total_acc, last_pymnt_amnt)
cormat <- cor(cor_analysis_1)
round(cormat, 2) ## loan_amnt funded_amnt funded_amnt_inv int_rate installment
## loan_amnt 1.00 0.99 0.59 0.17 0.97
## funded_amnt 0.99 1.00 0.59 0.15 0.98
## funded_amnt_inv 0.59 0.59 1.00 0.37 0.54
## int_rate 0.17 0.15 0.37 1.00 0.12
## installment 0.97 0.98 0.54 0.12 1.00
## annual_inc 0.45 0.45 0.26 0.03 0.47
## dti 0.15 0.15 0.09 0.12 0.16
## fico_range_low 0.22 0.24 0.25 -0.33 0.19
## fico_range_high 0.22 0.24 0.25 -0.33 0.19
## open_acc 0.12 0.11 0.20 0.05 0.14
## revol_bal 0.39 0.40 0.06 -0.13 0.42
## revol_util 0.02 0.02 0.06 0.01 0.03
## total_acc 0.03 0.04 0.12 0.03 0.07
## last_pymnt_amnt 0.25 0.27 0.33 0.11 0.24
## annual_inc dti fico_range_low fico_range_high open_acc
## loan_amnt 0.45 0.15 0.22 0.22 0.12
## funded_amnt 0.45 0.15 0.24 0.24 0.11
## funded_amnt_inv 0.26 0.09 0.25 0.25 0.20
## int_rate 0.03 0.12 -0.33 -0.33 0.05
## installment 0.47 0.16 0.19 0.19 0.14
## annual_inc 1.00 -0.08 0.02 0.02 0.18
## dti -0.08 1.00 0.13 0.13 0.43
## fico_range_low 0.02 0.13 1.00 1.00 0.00
## fico_range_high 0.02 0.13 1.00 1.00 0.00
## open_acc 0.18 0.43 0.00 0.00 1.00
## revol_bal 0.39 0.29 0.27 0.27 0.27
## revol_util -0.06 0.26 -0.26 -0.26 0.27
## total_acc 0.21 0.35 0.05 0.05 0.70
## last_pymnt_amnt 0.08 0.01 0.00 0.00 0.08
## revol_bal revol_util total_acc last_pymnt_amnt
## loan_amnt 0.39 0.02 0.03 0.25
## funded_amnt 0.40 0.02 0.04 0.27
## funded_amnt_inv 0.06 0.06 0.12 0.33
## int_rate -0.13 0.01 0.03 0.11
## installment 0.42 0.03 0.07 0.24
## annual_inc 0.39 -0.06 0.21 0.08
## dti 0.29 0.26 0.35 0.01
## fico_range_low 0.27 -0.26 0.05 0.00
## fico_range_high 0.27 -0.26 0.05 0.00
## open_acc 0.27 0.27 0.70 0.08
## revol_bal 1.00 0.06 0.30 0.04
## revol_util 0.06 1.00 0.20 -0.03
## total_acc 0.30 0.20 1.00 0.08
## last_pymnt_amnt 0.04 -0.03 0.08 1.00
corrplot(cormat)pairs(cor_analysis_1)chart.Correlation(cor_analysis_1, histogram=TRUE, pch=4)cor_analysis %>%
ggplot(aes(Var2, Var1, fill = value)) +
geom_tile(color = "black")+ geom_text(aes(label = round(value,2)), color = "white", size = 3) +
coord_fixed() +
theme(axis.text.x=element_text(angle=45, hjust=1))data_prep <- loan_decimal %>% dplyr::select(loan_status, loan_amnt, funded_amnt, funded_amnt_inv, int_rate, installment, annual_inc, dti, delinq_2yrs, fico_range_low, fico_range_high, inq_last_6mths, open_acc, revol_bal, revol_util, total_acc, total_rec_late_fee, last_pymnt_amnt, issue_d_year, earliest_cr_line_year, last_pymnt_d_year, last_credit_pull_d_year, term, grade, sub_grade, emp_length, home_ownership, verification_status, purpose, addr_state, pub_rec, tax_liens) %>%
mutate(loan_status = as.factor(loan_status)) %>%
mutate_if(is.character,factor)
head(data_prep)data_prep %>% skim()| Name | Piped data |
| Number of rows | 29777 |
| Number of columns | 32 |
| _______________________ | |
| Column type frequency: | |
| factor | 11 |
| numeric | 21 |
| ________________________ | |
| Group variables | None |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| loan_status | 0 | 1 | FALSE | 2 | cur: 25300, def: 4477 |
| term | 3 | 1 | FALSE | 2 | 36 : 22160, 60 : 7614 |
| grade | 3 | 1 | FALSE | 7 | B: 8620, A: 7142, C: 6068, D: 4268 |
| sub_grade | 3 | 1 | FALSE | 35 | B3: 2088, A4: 2044, A5: 1957, B5: 1932 |
| emp_length | 3 | 1 | FALSE | 12 | 10+: 6577, < 1: 3491, 2 y: 3313, 3 y: 3008 |
| home_ownership | 3 | 1 | FALSE | 5 | REN: 14064, MOR: 13340, OWN: 2275, OTH: 91 |
| verification_status | 3 | 1 | FALSE | 3 | Not: 13128, Ver: 9460, Sou: 7186 |
| purpose | 3 | 1 | FALSE | 14 | deb: 13816, cre: 3850, oth: 3108, hom: 2226 |
| addr_state | 3 | 1 | FALSE | 50 | CA: 5188, NY: 2836, FL: 2200, TX: 2067 |
| pub_rec | 23 | 1 | FALSE | 6 | 0: 28086, 1: 1610, 2: 45, 3: 11 |
| tax_liens | 79 | 1 | FALSE | 2 | 0: 29697, 1: 1 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| loan_amnt | 3 | 1 | 11109.43 | 7404.65 | 500.00 | 5225.00 | 9800.00 | 15000.00 | 35000.00 | ▇▇▃▂▁ |
| funded_amnt | 3 | 1 | 10843.64 | 7147.05 | 500.00 | 5100.00 | 9600.00 | 15000.00 | 35000.00 | ▇▇▃▂▁ |
| funded_amnt_inv | 3 | 1 | 10149.66 | 7130.86 | 0.00 | 4950.00 | 8500.00 | 14000.00 | 35000.00 | ▇▆▃▁▁ |
| int_rate | 3 | 1 | 0.12 | 0.04 | 0.05 | 0.10 | 0.12 | 0.15 | 0.24 | ▆▇▇▂▁ |
| installment | 3 | 1 | 323.81 | 209.77 | 15.67 | 165.84 | 278.94 | 429.86 | 1305.19 | ▇▆▂▁▁ |
| annual_inc | 4 | 1 | 69201.23 | 66566.42 | 2000.00 | 40000.00 | 59000.00 | 82500.00 | 6000000.00 | ▇▁▁▁▁ |
| dti | 3 | 1 | 13.38 | 6.74 | 0.00 | 8.19 | 13.49 | 18.70 | 29.99 | ▅▇▇▆▂ |
| delinq_2yrs | 23 | 1 | 0.16 | 0.52 | 0.00 | 0.00 | 0.00 | 0.00 | 13.00 | ▇▁▁▁▁ |
| fico_range_low | 3 | 1 | 713.05 | 36.31 | 610.00 | 685.00 | 710.00 | 740.00 | 825.00 | ▁▇▇▅▁ |
| fico_range_high | 3 | 1 | 717.05 | 36.31 | 614.00 | 689.00 | 714.00 | 744.00 | 829.00 | ▁▇▇▅▁ |
| inq_last_6mths | 23 | 1 | 1.08 | 1.54 | 0.00 | 0.00 | 1.00 | 2.00 | 33.00 | ▇▁▁▁▁ |
| open_acc | 23 | 1 | 9.34 | 4.51 | 1.00 | 6.00 | 9.00 | 12.00 | 47.00 | ▇▃▁▁▁ |
| revol_bal | 3 | 1 | 14310.00 | 22696.55 | 0.00 | 3644.00 | 8783.50 | 17187.00 | 1207359.00 | ▇▁▁▁▁ |
| revol_util | 67 | 1 | 0.49 | 0.28 | 0.00 | 0.26 | 0.50 | 0.73 | 1.19 | ▇▇▇▇▁ |
| total_acc | 23 | 1 | 22.08 | 11.59 | 1.00 | 13.00 | 20.00 | 29.00 | 81.00 | ▇▇▂▁▁ |
| total_rec_late_fee | 3 | 1 | 1.50 | 7.72 | 0.00 | 0.00 | 0.00 | 0.00 | 180.20 | ▇▁▁▁▁ |
| last_pymnt_amnt | 3 | 1 | 2615.41 | 4373.70 | 0.00 | 211.02 | 531.98 | 3171.29 | 36115.20 | ▇▁▁▁▁ |
| issue_d_year | 3 | 1 | 11.78 | 0.97 | 11.00 | 11.00 | 11.00 | 12.00 | 15.00 | ▇▅▂▁▁ |
| earliest_cr_line_year | 23 | 1 | 25.43 | 6.85 | 14.00 | 21.00 | 24.00 | 29.00 | 76.00 | ▇▃▁▁▁ |
| last_pymnt_d_year | 67 | 1 | 9.30 | 1.61 | 6.00 | 8.00 | 9.00 | 10.00 | 15.00 | ▂▇▆▂▁ |
| last_credit_pull_d_year | 5 | 1 | 7.43 | 1.81 | 6.00 | 6.00 | 6.00 | 9.00 | 15.00 | ▇▃▂▁▁ |
# deal w. categoricals
loan_recipe <- recipe(~.,data_prep) %>%
step_unknown(all_nominal_predictors()) %>%
step_novel(all_nominal_predictors()) %>%
step_impute_median(all_numeric_predictors()) %>%
step_dummy(all_nominal_predictors()) %>%
prep()
bake_loan <- bake(loan_recipe, data_prep)
# just numeric data
loan_numeric <- data_prep %>% select_if(is.numeric)
loan_recipe <- recipe(~.,loan_numeric) %>%
step_impute_median(all_numeric()) %>%
prep()iso_forest <- isolationForest$new(
sample_size = 256,
num_trees = 100,
max_depth = ceiling(log2(256)))
iso_forest$fit(bake_loan)## INFO [18:53:45.796] dataset has duplicated rows
## INFO [18:53:45.844] Building Isolation Forest ...
## INFO [18:53:46.484] done
## INFO [18:53:46.485] Computing depth of terminal nodes ...
## INFO [18:53:47.134] done
## INFO [18:53:48.232] Completed growing isolation forest
evaluate histogram pick a value of average_depth to identify anomalies. a shorter average depth means the point is more isolated and more likely an anomaly
pred_train <- iso_forest$predict(bake_loan)
pred_train %>%
ggplot(aes(average_depth)) +
geom_histogram(bins=20) +
geom_vline(xintercept = 7, linetype="dotted",
color = "blue", size=1.5) +
labs(title="Isolation Forest Average Tree Depth")pred_train %>%
ggplot(aes(anomaly_score)) +
geom_histogram(bins=20) +
geom_vline(xintercept = 0.62, linetype="dotted",
color = "blue", size=1.5) +
labs(title="Isolation Forest Anomaly Score Above 0.62")The steps of interpreting anomalies on a global level are:
train_pred <- bind_cols(iso_forest$predict(bake_loan),bake_loan) %>%
mutate(anomaly = as.factor(if_else(average_depth <= 7.1, "Anomaly","Normal")))
train_pred %>%
arrange(average_depth) %>%
count(anomaly)fmla <- as.formula(paste("anomaly ~ ", paste(bake_loan %>% colnames(), collapse= "+")))
outlier_tree <- decision_tree(min_n=2, tree_depth=3, cost_complexity = .01) %>%
set_mode("classification") %>%
set_engine("rpart") %>%
fit(fmla, data=train_pred)
outlier_tree$fit## n= 29777
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 29777 5 Normal (0.00016791483 0.99983208517)
## 2) revol_bal>=404867.5 6 1 Normal (0.16666666667 0.83333333333)
## 4) loan_amnt>=24500 1 0 Anomaly (1.00000000000 0.00000000000) *
## 5) loan_amnt< 24500 5 0 Normal (0.00000000000 1.00000000000) *
## 3) revol_bal< 404867.5 29771 4 Normal (0.00013435894 0.99986564106)
## 6) last_pymnt_amnt>=27804.08 62 3 Normal (0.04838709677 0.95161290323)
## 12) revol_bal>=79776.5 3 1 Anomaly (0.66666666667 0.33333333333) *
## 13) revol_bal< 79776.5 59 1 Normal (0.01694915254 0.98305084746) *
## 7) last_pymnt_amnt< 27804.08 29709 1 Normal (0.00003365983 0.99996634017) *
library(rpart.plot) # -- plotting decision trees
rpart.plot(outlier_tree$fit,clip.right.labs = FALSE, branch = .3, under = TRUE, roundint=FALSE, extra=3)anomaly_rules <- rpart.rules(outlier_tree$fit,roundint=FALSE, extra = 4, cover = TRUE, clip.facs = TRUE) %>% clean_names() %>%
#filter(anomaly=="Anomaly") %>%
mutate(rule = "IF")
rule_cols <- anomaly_rules %>% dplyr::select(starts_with("x_")) %>% colnames()
for (col in rule_cols){
anomaly_rules <- anomaly_rules %>%
mutate(rule = paste(rule, !!as.name(col)))
}
anomaly_rules %>%
as.data.frame() %>%
filter(anomaly == "Anomaly") %>%
mutate(rule = paste(rule, " THEN ", anomaly )) %>%
mutate(rule = paste(rule," coverage ", cover)) %>%
dplyr::select( rule)anomaly_rules %>%
as.data.frame() %>%
filter(anomaly == "Normal") %>%
mutate(rule = paste(rule, " THEN ", anomaly )) %>%
mutate(rule = paste(rule," coverage ", cover)) %>%
dplyr::select( rule)pred_train <- bind_cols(iso_forest$predict(bake_loan),
bake_loan)
pred_train %>%
arrange(desc(anomaly_score)) %>%
slice_max(order_by=anomaly_score,n=5)# Anomalous Records id: 2658, 2737, 5259, 29546, 29658data <- data_prep[-c(2658, 2737, 5259, 29546, 29658),]set.seed(1234)
# -- performs our train / test split
split <- initial_split(data, prop = 0.7)
# -- extract the training data form our banana split
train <- training(split)
# -- extract the test data
test <- testing(split)
sprintf("Train PCT : %1.2f%%", nrow(train)/ nrow(data) * 100)## [1] "Train PCT : 70.00%"
sprintf("Test PCT : %1.2f%%", nrow(test)/ nrow(data) * 100)## [1] "Test PCT : 30.00%"
loan_recipe <- recipe(loan_status ~ .,
data = train) %>%
step_novel(all_nominal_predictors()) %>%
step_unknown(all_nominal_predictors()) %>%
step_impute_median(all_numeric_predictors()) %>%
step_impute_mode(all_nominal_predictors()) %>%
step_normalize(all_numeric_predictors()) %>%
step_dummy(all_nominal_predictors(), one_hot = TRUE)
logistic_spec <- logistic_reg() %>%
set_mode("classification") %>%
set_engine("glm")
logistic_wf <- workflow() %>%
add_recipe(loan_recipe) %>%
add_model(logistic_spec) %>%
fit(train)
logistic_wf %>%
pull_workflow_fit() %>%
tidy() %>%
mutate(across(is.numeric,round,3))predict(logistic_wf, train, type="prob") %>%
bind_cols(predict(logistic_wf, train, type="class")) %>%
bind_cols(train) -> logistic_train
predict(logistic_wf, test, type="prob") %>%
bind_cols(predict(logistic_wf, test, type="class")) %>%
bind_cols(test) -> logistic_test
logistic_train %>%
metrics(loan_status, estimate = .pred_class, .pred_default) %>%
mutate(part="training") %>%
bind_rows(logistic_test %>%
metrics(loan_status, estimate = .pred_class, .pred_default) %>%
mutate(part="testing"))options(yardstick.event_first = FALSE)
# Variable Importance
logistic_wf %>%
extract_fit_parsnip() %>%
vi()logistic_wf %>%
extract_fit_parsnip() %>%
vip()logistic_train <- logistic_train %>% mutate(part = "training")
logistic_test <- logistic_test %>% mutate(part = "testing")
logistic_train %>% mutate(model = "train") %>%
bind_rows(logistic_test %>% mutate(model="test")) %>%
group_by(model) %>%
roc_curve(loan_status, .pred_default) %>%
autoplot() +
geom_vline(xintercept = 0.05, # 5% FPR
color = "red",
linetype = "longdash") +
geom_vline(xintercept = 0.25, # 25% FPR
color = "blue",
linetype = "longdash") +
geom_vline(xintercept = 0.75, # 75% FPR
color = "green",
linetype = "longdash") +
labs(title = "Logistic Regression ROC Curve" , x = "FPR(1 - specificity)", y = "TPR(recall)")logistic_train %>%
mutate(predict_class = as.factor(if_else(.pred_default >=0.5,'default','current'))) %>%
conf_mat(loan_status, estimate = predict_class) %>%
autoplot(type = "heatmap") +
labs(title="confusion matrix threshold >= 0.5")logistic_test %>%
mutate(predict_class = as.factor(if_else(.pred_default >=0.5,'default','current'))) %>%
conf_mat(loan_status, estimate = predict_class) %>%
autoplot(type = "heatmap") +
labs(title="confusion matrix threshold >= 0.5")logistic_train_mutate <- logistic_train %>%
mutate(predict_class = as.factor(if_else(.pred_default >=0.5,'default','current')))
logistic_test_mutate <- logistic_test %>%
mutate(predict_class = as.factor(if_else(.pred_default >=0.5,'default','current')))
bind_rows(logistic_train, logistic_test) %>%
group_by(part) %>%
dplyr::select(loan_status, .pred_class) -> train_test
precision <- train_test %>%
yardstick::precision(loan_status, .pred_class)
recall <- train_test %>%
yardstick::recall(loan_status, .pred_class)
logistic_train %>%
metrics(loan_status, .pred_default, estimate = .pred_class) %>%
bind_rows(logistic_train_mutate %>% yardstick::precision(loan_status, predict_class)) %>%
bind_rows(logistic_train_mutate %>% yardstick::recall(loan_status, predict_class)) %>%
filter(.metric %in% c("accuracy", "roc_auc", "mn_log_loss", "precision", "recall")) %>%
mutate(part="training") %>%
bind_rows(logistic_test %>%
metrics(loan_status, .pred_default, estimate = .pred_class) %>%
bind_rows(logistic_test_mutate %>% yardstick::precision(loan_status, predict_class)) %>%
bind_rows(logistic_test_mutate %>% yardstick::recall(loan_status, predict_class)) %>%
filter(.metric %in% c("accuracy", "roc_auc", "mn_log_loss", "precision", "recall")) %>%
mutate(part="testing")) %>%
bind_rows(
bind_cols(precision, recall) %>%
mutate(f1_score = 2*.estimate...4*.estimate...8/(.estimate...4 + .estimate...8)) %>%
mutate(.metric = "F1_Score") %>%
mutate(.estimator = "binary") %>%
dplyr::select(part...1, .metric, .estimator, f1_score) %>%
rename(c(".estimate" = "f1_score"))
) %>%
arrange(desc(part))# Score Distribution
logistic_test %>%
ggplot(aes(.pred_default,fill=loan_status)) +
geom_histogram(bins=50) +
geom_vline(aes(xintercept=.5, color="red")) +
geom_vline(aes(xintercept=.3, color="green")) +
geom_vline(aes(xintercept=.7, color="blue")) +
labs(title = "nnet 1") -> scored_dist
print(scored_dist)# operating range 0 - 10%
operating_range <- logistic_test %>%
roc_curve(loan_status, .pred_default) %>%
mutate(
fpr = round((1 - specificity), 3),
tpr = round(sensitivity, 3),
score_threshold = round(.threshold, 5)
) %>%
group_by(fpr) %>%
summarise(threshold = round(mean(score_threshold),4),
tpr = mean(tpr)) %>%
filter(fpr <= 0.5)
print(operating_range)## # A tibble: 501 × 3
## fpr threshold tpr
## <dbl> <dbl> <dbl>
## 1 0 Inf 0.0547
## 2 0.001 0.991 0.142
## 3 0.002 0.981 0.219
## 4 0.003 0.965 0.289
## 5 0.004 0.947 0.335
## 6 0.005 0.929 0.384
## 7 0.006 0.905 0.429
## 8 0.007 0.886 0.459
## 9 0.008 0.862 0.496
## 10 0.009 0.840 0.522
## # … with 491 more rows
# Precision Recall Chart
logistic_test %>%
pr_curve(loan_status, .pred_default) %>%
mutate(
recall = round(recall, 2),
.threshold = round(.threshold, 3),
precision = round(precision, 3)
) %>%
group_by(recall) %>%
summarise(precision = max(precision),
.threshold = min(.threshold))# create an explainer of a model
logistic_explainer <- explain_tidymodels(
logistic_wf,
data = train ,
y = train$loan_default ,
verbose = TRUE
)## Preparation of a new explainer is initiated
## -> model label : workflow ( default )
## -> data : 20840 rows 32 cols
## -> data : tibble converted into a data.frame
## -> target variable : not specified! ( WARNING )
## -> predict function : yhat.workflow will be used ( default )
## -> predicted values : No value for predict function target column. ( default )
## -> model_info : package tidymodels , ver. 1.0.0 , task classification ( default )
## -> model_info : Model info detected classification task but 'y' is a NULL . ( WARNING )
## -> model_info : By deafult classification tasks supports only numercical 'y' parameter.
## -> model_info : Consider changing to numerical vector with 0 and 1 values.
## -> model_info : Otherwise I will not be able to calculate residuals or loss function.
## -> predicted values : numerical, min = 0.0000000000000002220446 , mean = 0.1518757 , max = 0.9999777
## -> residual function : difference between y and yhat ( default )
## A new explainer has been created!
logistic_variable <- c('last_pymnt_d_year', 'last_credit_pull_d_year', 'issue_d_year', 'last_pymnt_amnt', 'total_rec_late_fee')
pdp <- function(m){
# create a profile of a single variable for a model
pdp_variable <- model_profile(
logistic_explainer,
variables = as.character(m)
)
# Plot it
plot(pdp_variable) +
ggtitle(paste0("Partial Dependence Plot for ", as.character(m))) +
theme(axis.text.x=element_text(angle=45, hjust=1))
}
for (c in logistic_variable){
print(pdp(c))
}Here we use the hyper parameters
lasso_spec <- logistic_reg(penalty = 0.01, mixture = 1) %>%
set_mode("classification") %>%
set_engine("glmnet")
lasso_wf <- workflow() %>%
add_recipe(loan_recipe) %>%
add_model(lasso_spec) %>%
fit(train)
lasso_wf %>%
pull_workflow_fit() %>%
tidy() %>%
filter(estimate != 0)predict(lasso_wf, train, type="prob") %>%
bind_cols(predict(logistic_wf, train, type="class")) %>%
bind_cols(train) -> lasso_train
predict(lasso_wf, test, type="prob") %>%
bind_cols(predict(lasso_wf, test, type="class")) %>%
bind_cols(test) -> lasso_test
lasso_train %>%
metrics(loan_status, estimate = .pred_class, .pred_default) %>%
mutate(part="training") %>%
bind_rows(lasso_test %>%
metrics(loan_status, estimate = .pred_class, .pred_default) %>%
mutate(part="testing"))# variables in recipe: loan_amnt, annual_inc, inq_last_6mths, last_pymnt_amnt, term, int_rate, emp_length, purpose, issue_d_year, last_pymnt_year, grade, last_credit_pull_year, funded_amnt, fico_range_low, total_rec_late_feeoptions(yardstick.event_first = FALSE)
# Variable Importance
lasso_wf %>%
extract_fit_parsnip() %>%
vi()lasso_wf %>%
extract_fit_parsnip() %>%
vip()lasso_train <- lasso_train %>% mutate(part = "training")
lasso_test <- lasso_test %>% mutate(part = "testing")
lasso_train %>% mutate(model = "train") %>%
bind_rows(lasso_test %>% mutate(model="test")) %>%
group_by(model) %>%
roc_curve(loan_status, .pred_default) %>%
autoplot() +
geom_vline(xintercept = 0.05, # 5% FPR
color = "red",
linetype = "longdash") +
geom_vline(xintercept = 0.25, # 25% FPR
color = "blue",
linetype = "longdash") +
geom_vline(xintercept = 0.75, # 75% FPR
color = "green",
linetype = "longdash") +
labs(title = "Lasso ROC Curve" , x = "FPR(1 - specificity)", y = "TPR(recall)")lasso_train %>%
mutate(predict_class = as.factor(if_else(.pred_default >=0.5,'default','current'))) %>%
conf_mat(loan_status, estimate = predict_class) %>%
autoplot(type = "heatmap") +
labs(title="confusion matrix threshold >= 0.5")lasso_test %>%
mutate(predict_class = as.factor(if_else(.pred_default >=0.5,'default','current'))) %>%
conf_mat(loan_status, estimate = predict_class) %>%
autoplot(type = "heatmap") +
labs(title="confusion matrix threshold >= 0.5")lasso_train_mutate <- lasso_train %>%
mutate(predict_class = as.factor(if_else(.pred_default >=0.5,'default','current')))
lasso_test_mutate <- lasso_test %>%
mutate(predict_class = as.factor(if_else(.pred_default >=0.5,'default','current')))
bind_rows(lasso_train, lasso_test) %>%
group_by(part) %>%
dplyr::select(loan_status, .pred_class) -> train_test
precision <- train_test %>%
yardstick::precision(loan_status, .pred_class)
recall <- train_test %>%
yardstick::recall(loan_status, .pred_class)
lasso_train %>%
metrics(loan_status, .pred_default, estimate = .pred_class) %>%
bind_rows(logistic_train_mutate %>% yardstick::precision(loan_status, predict_class)) %>%
bind_rows(logistic_train_mutate %>% yardstick::recall(loan_status, predict_class)) %>%
filter(.metric %in% c("accuracy", "roc_auc", "mn_log_loss", "precision", "recall")) %>%
mutate(part="training") %>%
bind_rows(lasso_test %>%
metrics(loan_status, .pred_default, estimate = .pred_class) %>%
bind_rows(logistic_test_mutate %>% yardstick::precision(loan_status, predict_class)) %>%
bind_rows(logistic_test_mutate %>% yardstick::recall(loan_status, predict_class)) %>%
filter(.metric %in% c("accuracy", "roc_auc", "mn_log_loss", "precision", "recall")) %>%
mutate(part="testing")) %>%
bind_rows(
bind_cols(precision, recall) %>%
mutate(f1_score = 2*.estimate...4*.estimate...8/(.estimate...4 + .estimate...8)) %>%
mutate(.metric = "F1_Score") %>%
mutate(.estimator = "binary") %>%
dplyr::select(part...1, .metric, .estimator, f1_score) %>%
rename(c(".estimate" = "f1_score"))
) %>%
arrange(desc(part))# Score Distribution
lasso_test %>%
ggplot(aes(.pred_default,fill=loan_status)) +
geom_histogram(bins=50) +
geom_vline(aes(xintercept=.5, color="red")) +
geom_vline(aes(xintercept=.3, color="green")) +
geom_vline(aes(xintercept=.7, color="blue")) +
labs(title = "nnet 1") -> scored_dist
print(scored_dist)# operating range 0 - 10%
operating_range <- lasso_test %>%
roc_curve(loan_status, .pred_default) %>%
mutate(
fpr = round((1 - specificity), 3),
tpr = round(sensitivity, 3),
score_threshold = round(.threshold, 5)
) %>%
group_by(fpr) %>%
summarise(threshold = round(mean(score_threshold),4),
tpr = mean(tpr)) %>%
filter(fpr <= 0.5)
print(operating_range)## # A tibble: 501 × 3
## fpr threshold tpr
## <dbl> <dbl> <dbl>
## 1 0 Inf 0.0408
## 2 0.001 0.881 0.140
## 3 0.002 0.829 0.228
## 4 0.003 0.783 0.281
## 5 0.004 0.737 0.338
## 6 0.005 0.700 0.387
## 7 0.006 0.674 0.415
## 8 0.007 0.655 0.443
## 9 0.008 0.634 0.468
## 10 0.009 0.613 0.495
## # … with 491 more rows
# Precision Recall Chart
lasso_test %>%
pr_curve(loan_status, .pred_default) %>%
mutate(
recall = round(recall, 2),
.threshold = round(.threshold, 3),
precision = round(precision, 3)
) %>%
group_by(recall) %>%
summarise(precision = max(precision),
.threshold = min(.threshold))# create an explainer of a model
lasso_explainer <- explain_tidymodels(
logistic_wf,
data = train ,
y = train$loan_default ,
verbose = TRUE
)## Preparation of a new explainer is initiated
## -> model label : workflow ( default )
## -> data : 20840 rows 32 cols
## -> data : tibble converted into a data.frame
## -> target variable : not specified! ( WARNING )
## -> predict function : yhat.workflow will be used ( default )
## -> predicted values : No value for predict function target column. ( default )
## -> model_info : package tidymodels , ver. 1.0.0 , task classification ( default )
## -> model_info : Model info detected classification task but 'y' is a NULL . ( WARNING )
## -> model_info : By deafult classification tasks supports only numercical 'y' parameter.
## -> model_info : Consider changing to numerical vector with 0 and 1 values.
## -> model_info : Otherwise I will not be able to calculate residuals or loss function.
## -> predicted values : numerical, min = 0.0000000000000002220446 , mean = 0.1518757 , max = 0.9999777
## -> residual function : difference between y and yhat ( default )
## A new explainer has been created!
lasso_variable <- c('last_pymnt_amnt', 'term', 'tax_liens', 'addr_state', 'pub_rec')
pdp <- function(m){
# create a profile of a single variable for a model
pdp_variable <- model_profile(
lasso_explainer,
variables = as.character(m)
)
# Plot it
plot(pdp_variable) +
ggtitle(paste0("Partial Dependence Plot for ", as.character(m))) +
theme(axis.text.x=element_text(angle=45, hjust=1))
}
for (c in lasso_variable){
print(pdp(c))
}recipe <- recipe(loan_status ~ loan_amnt + annual_inc + inq_last_6mths + last_pymnt_amnt + term + int_rate + emp_length + purpose + issue_d_year + last_pymnt_d_year + grade + last_credit_pull_d_year + funded_amnt + fico_range_low + total_rec_late_fee, data=train) %>%
step_impute_median(all_numeric_predictors()) %>%
step_unknown(all_nominal_predictors()) %>%
step_scale(all_numeric_predictors()) %>%
step_novel(all_nominal_predictors()) %>% # new factor levels
step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
step_nzv(all_predictors()) %>%
prep()
recipe## Recipe
##
## Inputs:
##
## role #variables
## outcome 1
## predictor 15
##
## Training data contained 20840 data points and 60 incomplete rows.
##
## Operations:
##
## Median imputation for loan_amnt, annual_inc, inq_last_6mths, last_pym... [trained]
## Unknown factor level assignment for term, emp_length, purpose, grade [trained]
## Scaling for loan_amnt, annual_inc, inq_last_6mths, last_pym... [trained]
## Novel factor level assignment for term, emp_length, purpose, grade [trained]
## Dummy variables from term, emp_length, purpose, grade [trained]
## Sparse, unbalanced variable filter removed total_rec_late_fee, term_unknown, term_... [trained]
bake(recipe %>% prep(), train, composition = "tibble") %>% head()bake_train <- bake(recipe, new_data = train)
bake_test <- bake(recipe, new_data = test)# K-fold cross validation
kfold_splits <- vfold_cv(train, v=5)
nn_model <- mlp(hidden_units = tune(),
penalty=tune(),
epochs = tune(),
) %>%
set_engine("nnet") %>%
set_mode("classification")
nn_wflow <-workflow() %>%
add_recipe(recipe) %>%
add_model(nn_model)
nn_search_res <- nn_wflow %>%
tune_bayes(
resamples = kfold_splits,
# Generate five at semi-random to start
initial = 5,
iter = 50,
# How to measure performance?
metrics = metric_set(yardstick::roc_auc),
control = control_bayes(no_improve = 5, verbose = TRUE)
)##
## ❯ Generating a set of 5 initial parameter results
## ✓ Initialization complete
##
##
## ── Iteration 1 ─────────────────────────────────────────────────────────────────
##
## i Current best: roc_auc=0.9752 (@iter 0)
## i Gaussian process model
## ✓ Gaussian process model
## i Generating 5000 candidates
## i Predicted candidates
## i hidden_units=10, penalty=0.00713, epochs=13
## i Estimating performance
## i Fold1: preprocessor 1/1
## ✓ Fold1: preprocessor 1/1
## i Fold1: preprocessor 1/1, model 1/1
## ✓ Fold1: preprocessor 1/1, model 1/1
## i Fold1: preprocessor 1/1, model 1/1 (predictions)
## i Fold2: preprocessor 1/1
## ✓ Fold2: preprocessor 1/1
## i Fold2: preprocessor 1/1, model 1/1
## ✓ Fold2: preprocessor 1/1, model 1/1
## i Fold2: preprocessor 1/1, model 1/1 (predictions)
## i Fold3: preprocessor 1/1
## ✓ Fold3: preprocessor 1/1
## i Fold3: preprocessor 1/1, model 1/1
## ✓ Fold3: preprocessor 1/1, model 1/1
## i Fold3: preprocessor 1/1, model 1/1 (predictions)
## i Fold4: preprocessor 1/1
## ✓ Fold4: preprocessor 1/1
## i Fold4: preprocessor 1/1, model 1/1
## ✓ Fold4: preprocessor 1/1, model 1/1
## i Fold4: preprocessor 1/1, model 1/1 (predictions)
## i Fold5: preprocessor 1/1
## ✓ Fold5: preprocessor 1/1
## i Fold5: preprocessor 1/1, model 1/1
## ✓ Fold5: preprocessor 1/1, model 1/1
## i Fold5: preprocessor 1/1, model 1/1 (predictions)
## ✓ Estimating performance
## ⓧ Newest results: roc_auc=0.8772 (+/-0.0232)
##
## ── Iteration 2 ─────────────────────────────────────────────────────────────────
##
## i Current best: roc_auc=0.9752 (@iter 0)
## i Gaussian process model
## ✓ Gaussian process model
## i Generating 5000 candidates
## i Predicted candidates
## i hidden_units=8, penalty=0.000000429, epochs=379
## i Estimating performance
## i Fold1: preprocessor 1/1
## ✓ Fold1: preprocessor 1/1
## i Fold1: preprocessor 1/1, model 1/1
## ✓ Fold1: preprocessor 1/1, model 1/1
## i Fold1: preprocessor 1/1, model 1/1 (predictions)
## i Fold2: preprocessor 1/1
## ✓ Fold2: preprocessor 1/1
## i Fold2: preprocessor 1/1, model 1/1
## ✓ Fold2: preprocessor 1/1, model 1/1
## i Fold2: preprocessor 1/1, model 1/1 (predictions)
## i Fold3: preprocessor 1/1
## ✓ Fold3: preprocessor 1/1
## i Fold3: preprocessor 1/1, model 1/1
## ✓ Fold3: preprocessor 1/1, model 1/1
## i Fold3: preprocessor 1/1, model 1/1 (predictions)
## i Fold4: preprocessor 1/1
## ✓ Fold4: preprocessor 1/1
## i Fold4: preprocessor 1/1, model 1/1
## ✓ Fold4: preprocessor 1/1, model 1/1
## i Fold4: preprocessor 1/1, model 1/1 (predictions)
## i Fold5: preprocessor 1/1
## ✓ Fold5: preprocessor 1/1
## i Fold5: preprocessor 1/1, model 1/1
## ✓ Fold5: preprocessor 1/1, model 1/1
## i Fold5: preprocessor 1/1, model 1/1 (predictions)
## ✓ Estimating performance
## ⓧ Newest results: roc_auc=0.9642 (+/-0.00384)
##
## ── Iteration 3 ─────────────────────────────────────────────────────────────────
##
## i Current best: roc_auc=0.9752 (@iter 0)
## i Gaussian process model
## ✓ Gaussian process model
## i Generating 5000 candidates
## i Predicted candidates
## i hidden_units=7, penalty=0.819, epochs=171
## i Estimating performance
## i Fold1: preprocessor 1/1
## ✓ Fold1: preprocessor 1/1
## i Fold1: preprocessor 1/1, model 1/1
## ✓ Fold1: preprocessor 1/1, model 1/1
## i Fold1: preprocessor 1/1, model 1/1 (predictions)
## i Fold2: preprocessor 1/1
## ✓ Fold2: preprocessor 1/1
## i Fold2: preprocessor 1/1, model 1/1
## ✓ Fold2: preprocessor 1/1, model 1/1
## i Fold2: preprocessor 1/1, model 1/1 (predictions)
## i Fold3: preprocessor 1/1
## ✓ Fold3: preprocessor 1/1
## i Fold3: preprocessor 1/1, model 1/1
## ✓ Fold3: preprocessor 1/1, model 1/1
## i Fold3: preprocessor 1/1, model 1/1 (predictions)
## i Fold4: preprocessor 1/1
## ✓ Fold4: preprocessor 1/1
## i Fold4: preprocessor 1/1, model 1/1
## ✓ Fold4: preprocessor 1/1, model 1/1
## i Fold4: preprocessor 1/1, model 1/1 (predictions)
## i Fold5: preprocessor 1/1
## ✓ Fold5: preprocessor 1/1
## i Fold5: preprocessor 1/1, model 1/1
## ✓ Fold5: preprocessor 1/1, model 1/1
## i Fold5: preprocessor 1/1, model 1/1 (predictions)
## ✓ Estimating performance
## ⓧ Newest results: roc_auc=0.9743 (+/-0.000951)
##
## ── Iteration 4 ─────────────────────────────────────────────────────────────────
##
## i Current best: roc_auc=0.9752 (@iter 0)
## i Gaussian process model
## ✓ Gaussian process model
## i Generating 5000 candidates
## i Predicted candidates
## i hidden_units=10, penalty=0.125, epochs=658
## i Estimating performance
## i Fold1: preprocessor 1/1
## ✓ Fold1: preprocessor 1/1
## i Fold1: preprocessor 1/1, model 1/1
## ✓ Fold1: preprocessor 1/1, model 1/1
## i Fold1: preprocessor 1/1, model 1/1 (predictions)
## i Fold2: preprocessor 1/1
## ✓ Fold2: preprocessor 1/1
## i Fold2: preprocessor 1/1, model 1/1
## ✓ Fold2: preprocessor 1/1, model 1/1
## i Fold2: preprocessor 1/1, model 1/1 (predictions)
## i Fold3: preprocessor 1/1
## ✓ Fold3: preprocessor 1/1
## i Fold3: preprocessor 1/1, model 1/1
## ✓ Fold3: preprocessor 1/1, model 1/1
## i Fold3: preprocessor 1/1, model 1/1 (predictions)
## i Fold4: preprocessor 1/1
## ✓ Fold4: preprocessor 1/1
## i Fold4: preprocessor 1/1, model 1/1
## ✓ Fold4: preprocessor 1/1, model 1/1
## i Fold4: preprocessor 1/1, model 1/1 (predictions)
## i Fold5: preprocessor 1/1
## ✓ Fold5: preprocessor 1/1
## i Fold5: preprocessor 1/1, model 1/1
## ✓ Fold5: preprocessor 1/1, model 1/1
## i Fold5: preprocessor 1/1, model 1/1 (predictions)
## ✓ Estimating performance
## ♥ Newest results: roc_auc=0.9758 (+/-0.00147)
##
## ── Iteration 5 ─────────────────────────────────────────────────────────────────
##
## i Current best: roc_auc=0.9758 (@iter 4)
## i Gaussian process model
## ✓ Gaussian process model
## i Generating 5000 candidates
## i Predicted candidates
## i hidden_units=6, penalty=0.00000000103, epochs=12
## i Estimating performance
## i Fold1: preprocessor 1/1
## ✓ Fold1: preprocessor 1/1
## i Fold1: preprocessor 1/1, model 1/1
## ✓ Fold1: preprocessor 1/1, model 1/1
## i Fold1: preprocessor 1/1, model 1/1 (predictions)
## i Fold2: preprocessor 1/1
## ✓ Fold2: preprocessor 1/1
## i Fold2: preprocessor 1/1, model 1/1
## ✓ Fold2: preprocessor 1/1, model 1/1
## i Fold2: preprocessor 1/1, model 1/1 (predictions)
## i Fold3: preprocessor 1/1
## ✓ Fold3: preprocessor 1/1
## i Fold3: preprocessor 1/1, model 1/1
## ✓ Fold3: preprocessor 1/1, model 1/1
## i Fold3: preprocessor 1/1, model 1/1 (predictions)
## i Fold4: preprocessor 1/1
## ✓ Fold4: preprocessor 1/1
## i Fold4: preprocessor 1/1, model 1/1
## ✓ Fold4: preprocessor 1/1, model 1/1
## i Fold4: preprocessor 1/1, model 1/1 (predictions)
## i Fold5: preprocessor 1/1
## ✓ Fold5: preprocessor 1/1
## i Fold5: preprocessor 1/1, model 1/1
## ✓ Fold5: preprocessor 1/1, model 1/1
## i Fold5: preprocessor 1/1, model 1/1 (predictions)
## ✓ Estimating performance
## ⓧ Newest results: roc_auc=0.8025 (+/-0.052)
##
## ── Iteration 6 ─────────────────────────────────────────────────────────────────
##
## i Current best: roc_auc=0.9758 (@iter 4)
## i Gaussian process model
## ✓ Gaussian process model
## i Generating 5000 candidates
## i Predicted candidates
## i hidden_units=3, penalty=0.00000000757, epochs=170
## i Estimating performance
## i Fold1: preprocessor 1/1
## ✓ Fold1: preprocessor 1/1
## i Fold1: preprocessor 1/1, model 1/1
## ✓ Fold1: preprocessor 1/1, model 1/1
## i Fold1: preprocessor 1/1, model 1/1 (predictions)
## i Fold2: preprocessor 1/1
## ✓ Fold2: preprocessor 1/1
## i Fold2: preprocessor 1/1, model 1/1
## ✓ Fold2: preprocessor 1/1, model 1/1
## i Fold2: preprocessor 1/1, model 1/1 (predictions)
## i Fold3: preprocessor 1/1
## ✓ Fold3: preprocessor 1/1
## i Fold3: preprocessor 1/1, model 1/1
## ✓ Fold3: preprocessor 1/1, model 1/1
## i Fold3: preprocessor 1/1, model 1/1 (predictions)
## i Fold4: preprocessor 1/1
## ✓ Fold4: preprocessor 1/1
## i Fold4: preprocessor 1/1, model 1/1
## ✓ Fold4: preprocessor 1/1, model 1/1
## i Fold4: preprocessor 1/1, model 1/1 (predictions)
## i Fold5: preprocessor 1/1
## ✓ Fold5: preprocessor 1/1
## i Fold5: preprocessor 1/1, model 1/1
## ✓ Fold5: preprocessor 1/1, model 1/1
## i Fold5: preprocessor 1/1, model 1/1 (predictions)
## ✓ Estimating performance
## ⓧ Newest results: roc_auc=0.8459 (+/-0.101)
##
## ── Iteration 7 ─────────────────────────────────────────────────────────────────
##
## i Current best: roc_auc=0.9758 (@iter 4)
## i Gaussian process model
## ✓ Gaussian process model
## i Generating 5000 candidates
## i Predicted candidates
## i hidden_units=9, penalty=0.000000188, epochs=582
## i Estimating performance
## i Fold1: preprocessor 1/1
## ✓ Fold1: preprocessor 1/1
## i Fold1: preprocessor 1/1, model 1/1
## ✓ Fold1: preprocessor 1/1, model 1/1
## i Fold1: preprocessor 1/1, model 1/1 (predictions)
## i Fold2: preprocessor 1/1
## ✓ Fold2: preprocessor 1/1
## i Fold2: preprocessor 1/1, model 1/1
## ✓ Fold2: preprocessor 1/1, model 1/1
## i Fold2: preprocessor 1/1, model 1/1 (predictions)
## i Fold3: preprocessor 1/1
## ✓ Fold3: preprocessor 1/1
## i Fold3: preprocessor 1/1, model 1/1
## ✓ Fold3: preprocessor 1/1, model 1/1
## i Fold3: preprocessor 1/1, model 1/1 (predictions)
## i Fold4: preprocessor 1/1
## ✓ Fold4: preprocessor 1/1
## i Fold4: preprocessor 1/1, model 1/1
## ✓ Fold4: preprocessor 1/1, model 1/1
## i Fold4: preprocessor 1/1, model 1/1 (predictions)
## i Fold5: preprocessor 1/1
## ✓ Fold5: preprocessor 1/1
## i Fold5: preprocessor 1/1, model 1/1
## ✓ Fold5: preprocessor 1/1, model 1/1
## i Fold5: preprocessor 1/1, model 1/1 (predictions)
## ✓ Estimating performance
## ⓧ Newest results: roc_auc=0.9659 (+/-0.00371)
##
## ── Iteration 8 ─────────────────────────────────────────────────────────────────
##
## i Current best: roc_auc=0.9758 (@iter 4)
## i Gaussian process model
## ✓ Gaussian process model
## i Generating 5000 candidates
## i Predicted candidates
## i hidden_units=8, penalty=0.000000000154, epochs=253
## i Estimating performance
## i Fold1: preprocessor 1/1
## ✓ Fold1: preprocessor 1/1
## i Fold1: preprocessor 1/1, model 1/1
## ✓ Fold1: preprocessor 1/1, model 1/1
## i Fold1: preprocessor 1/1, model 1/1 (predictions)
## i Fold2: preprocessor 1/1
## ✓ Fold2: preprocessor 1/1
## i Fold2: preprocessor 1/1, model 1/1
## ✓ Fold2: preprocessor 1/1, model 1/1
## i Fold2: preprocessor 1/1, model 1/1 (predictions)
## i Fold3: preprocessor 1/1
## ✓ Fold3: preprocessor 1/1
## i Fold3: preprocessor 1/1, model 1/1
## ✓ Fold3: preprocessor 1/1, model 1/1
## i Fold3: preprocessor 1/1, model 1/1 (predictions)
## i Fold4: preprocessor 1/1
## ✓ Fold4: preprocessor 1/1
## i Fold4: preprocessor 1/1, model 1/1
## ✓ Fold4: preprocessor 1/1, model 1/1
## i Fold4: preprocessor 1/1, model 1/1 (predictions)
## i Fold5: preprocessor 1/1
## ✓ Fold5: preprocessor 1/1
## i Fold5: preprocessor 1/1, model 1/1
## ✓ Fold5: preprocessor 1/1, model 1/1
## i Fold5: preprocessor 1/1, model 1/1 (predictions)
## ✓ Estimating performance
## ⓧ Newest results: roc_auc=0.969 (+/-0.00238)
##
## ── Iteration 9 ─────────────────────────────────────────────────────────────────
##
## i Current best: roc_auc=0.9758 (@iter 4)
## i Gaussian process model
## ✓ Gaussian process model
## i Generating 5000 candidates
## i Predicted candidates
## i hidden_units=7, penalty=0.00000000133, epochs=239
## i Estimating performance
## i Fold1: preprocessor 1/1
## ✓ Fold1: preprocessor 1/1
## i Fold1: preprocessor 1/1, model 1/1
## ✓ Fold1: preprocessor 1/1, model 1/1
## i Fold1: preprocessor 1/1, model 1/1 (predictions)
## i Fold2: preprocessor 1/1
## ✓ Fold2: preprocessor 1/1
## i Fold2: preprocessor 1/1, model 1/1
## ✓ Fold2: preprocessor 1/1, model 1/1
## i Fold2: preprocessor 1/1, model 1/1 (predictions)
## i Fold3: preprocessor 1/1
## ✓ Fold3: preprocessor 1/1
## i Fold3: preprocessor 1/1, model 1/1
## ✓ Fold3: preprocessor 1/1, model 1/1
## i Fold3: preprocessor 1/1, model 1/1 (predictions)
## i Fold4: preprocessor 1/1
## ✓ Fold4: preprocessor 1/1
## i Fold4: preprocessor 1/1, model 1/1
## ✓ Fold4: preprocessor 1/1, model 1/1
## i Fold4: preprocessor 1/1, model 1/1 (predictions)
## i Fold5: preprocessor 1/1
## ✓ Fold5: preprocessor 1/1
## i Fold5: preprocessor 1/1, model 1/1
## ✓ Fold5: preprocessor 1/1, model 1/1
## i Fold5: preprocessor 1/1, model 1/1 (predictions)
## ✓ Estimating performance
## ⓧ Newest results: roc_auc=0.9665 (+/-0.00453)
## ! No improvement for 5 iterations; returning current results.
Evaluate our tuning efforts
# Experiments
nn_search_res %>%
collect_metrics() nn_search_res %>%
select_best("roc_auc")tune_graph <- function(parm){
# Graph of learning rate
nn_search_res %>%
collect_metrics() %>%
ggplot(aes(!!as.name(parm), mean, color = .metric)) +
geom_errorbar(aes(
ymin = mean - std_err,
ymax = mean + std_err
),
alpha = 0.5
) +
geom_line(size = 1.5) +
facet_wrap(~.metric, scales = "free", nrow = 2) +
scale_x_log10() +
theme(legend.position = "none")
}
tune_graph("hidden_units")tune_graph("penalty")tune_graph("epochs")best_auc <- nn_search_res %>%
select_best("roc_auc")
best_aucnn_wflow <- finalize_workflow(
nn_wflow, best_auc
) %>%
fit(train)bind_cols(
predict(nn_wflow, train, type="prob"),
predict(nn_wflow, train, type="class"),
train) %>%
mutate(part = "train") -> scored_nn_train
bind_cols(
predict(nn_wflow,test, type="prob"),
predict(nn_wflow,test, type="class"),
test) %>%
mutate(part = "test") -> scored_nn_testoptions(yardstick.event_first = FALSE)
# Variable Importance
nn_wflow %>%
extract_fit_parsnip() %>%
vi()nn_wflow %>%
extract_fit_parsnip() %>%
vip()scored_nn_train <- scored_nn_train %>% mutate(part = "training")
scored_nn_test <- scored_nn_test %>% mutate(part = "testing")
scored_nn_train %>% mutate(model = "train") %>%
bind_rows(scored_nn_test %>% mutate(model="test")) %>%
group_by(model) %>%
roc_curve(loan_status, .pred_default) %>%
autoplot() +
geom_vline(xintercept = 0.05, # 5% FPR
color = "red",
linetype = "longdash") +
geom_vline(xintercept = 0.25, # 25% FPR
color = "blue",
linetype = "longdash") +
geom_vline(xintercept = 0.75, # 75% FPR
color = "green",
linetype = "longdash") +
labs(title = "Neural Network ROC Curve" , x = "FPR(1 - specificity)", y = "TPR(recall)")scored_nn_train %>%
mutate(predict_class = as.factor(if_else(.pred_default >=0.5,'default','current'))) %>%
conf_mat(loan_status, estimate = predict_class) %>%
autoplot(type = "heatmap") +
labs(title="confusion matrix threshold >= 0.5")scored_nn_test %>%
mutate(predict_class = as.factor(if_else(.pred_default >=0.5,'default','current'))) %>%
conf_mat(loan_status, estimate = predict_class) %>%
autoplot(type = "heatmap") +
labs(title="confusion matrix threshold >= 0.5")scored_nn_train_mutate <- scored_nn_train %>%
mutate(predict_class = as.factor(if_else(.pred_default >=0.5,'default','current')))
scored_nn_test_mutate <- scored_nn_test %>%
mutate(predict_class = as.factor(if_else(.pred_default >=0.5,'default','current')))
bind_rows(scored_nn_train, scored_nn_test) %>%
group_by(part) %>%
dplyr::select(loan_status, .pred_class) -> train_test
precision <- train_test %>%
yardstick::precision(loan_status, .pred_class)
recall <- train_test %>%
yardstick::recall(loan_status, .pred_class)
scored_nn_train %>%
metrics(loan_status, .pred_default, estimate = .pred_class) %>%
bind_rows(scored_nn_train_mutate %>% yardstick::precision(loan_status, predict_class)) %>%
bind_rows(scored_nn_train_mutate %>% yardstick::recall(loan_status, predict_class)) %>%
filter(.metric %in% c("accuracy", "roc_auc", "mn_log_loss", "precision", "recall")) %>%
mutate(part="training") %>%
bind_rows(scored_nn_test %>%
metrics(loan_status, .pred_default, estimate = .pred_class) %>%
bind_rows(scored_nn_test_mutate %>% yardstick::precision(loan_status, predict_class)) %>%
bind_rows(scored_nn_test_mutate %>% yardstick::recall(loan_status, predict_class)) %>%
filter(.metric %in% c("accuracy", "roc_auc", "mn_log_loss", "precision", "recall")) %>%
mutate(part="testing")) %>%
bind_rows(
bind_cols(precision, recall) %>%
mutate(f1_score = 2*.estimate...4*.estimate...8/(.estimate...4 + .estimate...8)) %>%
mutate(.metric = "F1_Score") %>%
mutate(.estimator = "binary") %>%
dplyr::select(part...1, .metric, .estimator, f1_score) %>%
rename(c(".estimate" = "f1_score"))
) %>%
arrange(desc(part))# Score Distribution
scored_nn_test %>%
ggplot(aes(.pred_default,fill=loan_status)) +
geom_histogram(bins=50) +
geom_vline(aes(xintercept=.5, color="red")) +
geom_vline(aes(xintercept=.3, color="green")) +
geom_vline(aes(xintercept=.7, color="blue")) +
labs(title = "nnet 1") -> scored_dist
print(scored_dist)# operating range 0 - 10%
operating_range <- scored_nn_test %>%
roc_curve(loan_status, .pred_default) %>%
mutate(
fpr = round((1 - specificity), 3),
tpr = round(sensitivity, 3),
score_threshold = round(.threshold, 5)
) %>%
group_by(fpr) %>%
summarise(threshold = round(mean(score_threshold),4),
tpr = mean(tpr)) %>%
filter(fpr <= 0.5)
print(operating_range)## # A tibble: 501 × 3
## fpr threshold tpr
## <dbl> <dbl> <dbl>
## 1 0 Inf 0.0478
## 2 0.001 0.727 0.219
## 3 0.002 0.720 0.379
## 4 0.003 0.713 0.460
## 5 0.004 0.704 0.530
## 6 0.005 0.699 0.569
## 7 0.006 0.692 0.602
## 8 0.007 0.684 0.638
## 9 0.008 0.677 0.665
## 10 0.009 0.670 0.691
## # … with 491 more rows
# Precision Recall Chart
scored_nn_test %>%
pr_curve(loan_status, .pred_default) %>%
mutate(
recall = round(recall, 2),
.threshold = round(.threshold, 3),
precision = round(precision, 3)
) %>%
group_by(recall) %>%
summarise(precision = max(precision),
.threshold = min(.threshold))# create an explainer of a model
nn_explainer <- explain_tidymodels(
logistic_wf,
data = train ,
y = train$loan_default ,
verbose = TRUE
)## Preparation of a new explainer is initiated
## -> model label : workflow ( default )
## -> data : 20840 rows 32 cols
## -> data : tibble converted into a data.frame
## -> target variable : not specified! ( WARNING )
## -> predict function : yhat.workflow will be used ( default )
## -> predicted values : No value for predict function target column. ( default )
## -> model_info : package tidymodels , ver. 1.0.0 , task classification ( default )
## -> model_info : Model info detected classification task but 'y' is a NULL . ( WARNING )
## -> model_info : By deafult classification tasks supports only numercical 'y' parameter.
## -> model_info : Consider changing to numerical vector with 0 and 1 values.
## -> model_info : Otherwise I will not be able to calculate residuals or loss function.
## -> predicted values : numerical, min = 0.0000000000000002220446 , mean = 0.1518757 , max = 0.9999777
## -> residual function : difference between y and yhat ( default )
## A new explainer has been created!
nn_variable <- c('term', 'last_pymnt_d_year', 'last_pymnt_amnt', 'grade', 'int_rate')
pdp <- function(m){
# create a profile of a single variable for a model
pdp_variable <- model_profile(
nn_explainer,
variables = as.character(m)
)
# Plot it
plot(pdp_variable) +
ggtitle(paste0("Partial Dependence Plot for ", as.character(m))) +
theme(axis.text.x=element_text(angle=45, hjust=1))
}
for (c in nn_variable){
print(pdp(c))
}mod <- nn_wflow$fit$fit$fit
plotnet(mod) kfold_splits <- vfold_cv(train, v=5)
rf_model <- rand_forest(trees=tune()) %>%
set_engine("ranger", num.threads = 5, max.depth = 10, importance="permutation") %>%
set_mode("classification")rf_wflow <- workflow() %>%
add_recipe(recipe) %>%
add_model(rf_model)
rf_search_res <- rf_wflow %>%
tune_bayes(
resamples = kfold_splits,
# Generate five at semi-random to start
initial = 5,
iter = 50,
# How to measure performance?
metrics = metric_set(yardstick::roc_auc),
control = control_bayes(no_improve = 5, verbose = TRUE)
)##
## ❯ Generating a set of 5 initial parameter results
## ✓ Initialization complete
##
##
## ── Iteration 1 ─────────────────────────────────────────────────────────────────
##
## i Current best: roc_auc=0.9696 (@iter 0)
## i Gaussian process model
## ✓ Gaussian process model
## i Generating 1995 candidates
## i Predicted candidates
## i trees=1723
## i Estimating performance
## i Fold1: preprocessor 1/1
## ✓ Fold1: preprocessor 1/1
## i Fold1: preprocessor 1/1, model 1/1
## ✓ Fold1: preprocessor 1/1, model 1/1
## i Fold1: preprocessor 1/1, model 1/1 (predictions)
## i Fold2: preprocessor 1/1
## ✓ Fold2: preprocessor 1/1
## i Fold2: preprocessor 1/1, model 1/1
## ✓ Fold2: preprocessor 1/1, model 1/1
## i Fold2: preprocessor 1/1, model 1/1 (predictions)
## i Fold3: preprocessor 1/1
## ✓ Fold3: preprocessor 1/1
## i Fold3: preprocessor 1/1, model 1/1
## ✓ Fold3: preprocessor 1/1, model 1/1
## i Fold3: preprocessor 1/1, model 1/1 (predictions)
## i Fold4: preprocessor 1/1
## ✓ Fold4: preprocessor 1/1
## i Fold4: preprocessor 1/1, model 1/1
## ✓ Fold4: preprocessor 1/1, model 1/1
## i Fold4: preprocessor 1/1, model 1/1 (predictions)
## i Fold5: preprocessor 1/1
## ✓ Fold5: preprocessor 1/1
## i Fold5: preprocessor 1/1, model 1/1
## ✓ Fold5: preprocessor 1/1, model 1/1
## i Fold5: preprocessor 1/1, model 1/1 (predictions)
## ✓ Estimating performance
## ⓧ Newest results: roc_auc=0.9695 (+/-0.00182)
##
## ── Iteration 2 ─────────────────────────────────────────────────────────────────
##
## i Current best: roc_auc=0.9696 (@iter 0)
## i Gaussian process model
## ✓ Gaussian process model
## i Generating 1994 candidates
## i Predicted candidates
## i trees=1666
## i Estimating performance
## i Fold1: preprocessor 1/1
## ✓ Fold1: preprocessor 1/1
## i Fold1: preprocessor 1/1, model 1/1
## ✓ Fold1: preprocessor 1/1, model 1/1
## i Fold1: preprocessor 1/1, model 1/1 (predictions)
## i Fold2: preprocessor 1/1
## ✓ Fold2: preprocessor 1/1
## i Fold2: preprocessor 1/1, model 1/1
## ✓ Fold2: preprocessor 1/1, model 1/1
## i Fold2: preprocessor 1/1, model 1/1 (predictions)
## i Fold3: preprocessor 1/1
## ✓ Fold3: preprocessor 1/1
## i Fold3: preprocessor 1/1, model 1/1
## ✓ Fold3: preprocessor 1/1, model 1/1
## i Fold3: preprocessor 1/1, model 1/1 (predictions)
## i Fold4: preprocessor 1/1
## ✓ Fold4: preprocessor 1/1
## i Fold4: preprocessor 1/1, model 1/1
## ✓ Fold4: preprocessor 1/1, model 1/1
## i Fold4: preprocessor 1/1, model 1/1 (predictions)
## i Fold5: preprocessor 1/1
## ✓ Fold5: preprocessor 1/1
## i Fold5: preprocessor 1/1, model 1/1
## ✓ Fold5: preprocessor 1/1, model 1/1
## i Fold5: preprocessor 1/1, model 1/1 (predictions)
## ✓ Estimating performance
## ⓧ Newest results: roc_auc=0.9695 (+/-0.00176)
##
## ── Iteration 3 ─────────────────────────────────────────────────────────────────
##
## i Current best: roc_auc=0.9696 (@iter 0)
## i Gaussian process model
## ✓ Gaussian process model
## i Generating 1993 candidates
## i Predicted candidates
## i trees=1694
## i Estimating performance
## i Fold1: preprocessor 1/1
## ✓ Fold1: preprocessor 1/1
## i Fold1: preprocessor 1/1, model 1/1
## ✓ Fold1: preprocessor 1/1, model 1/1
## i Fold1: preprocessor 1/1, model 1/1 (predictions)
## i Fold2: preprocessor 1/1
## ✓ Fold2: preprocessor 1/1
## i Fold2: preprocessor 1/1, model 1/1
## ✓ Fold2: preprocessor 1/1, model 1/1
## i Fold2: preprocessor 1/1, model 1/1 (predictions)
## i Fold3: preprocessor 1/1
## ✓ Fold3: preprocessor 1/1
## i Fold3: preprocessor 1/1, model 1/1
## ✓ Fold3: preprocessor 1/1, model 1/1
## i Fold3: preprocessor 1/1, model 1/1 (predictions)
## i Fold4: preprocessor 1/1
## ✓ Fold4: preprocessor 1/1
## i Fold4: preprocessor 1/1, model 1/1
## ✓ Fold4: preprocessor 1/1, model 1/1
## i Fold4: preprocessor 1/1, model 1/1 (predictions)
## i Fold5: preprocessor 1/1
## ✓ Fold5: preprocessor 1/1
## i Fold5: preprocessor 1/1, model 1/1
## ✓ Fold5: preprocessor 1/1, model 1/1
## i Fold5: preprocessor 1/1, model 1/1 (predictions)
## ✓ Estimating performance
## ⓧ Newest results: roc_auc=0.9693 (+/-0.00182)
##
## ── Iteration 4 ─────────────────────────────────────────────────────────────────
##
## i Current best: roc_auc=0.9696 (@iter 0)
## i Gaussian process model
## ✓ Gaussian process model
## i Generating 1992 candidates
## i Predicted candidates
## i trees=711
## i Estimating performance
## i Fold1: preprocessor 1/1
## ✓ Fold1: preprocessor 1/1
## i Fold1: preprocessor 1/1, model 1/1
## ✓ Fold1: preprocessor 1/1, model 1/1
## i Fold1: preprocessor 1/1, model 1/1 (predictions)
## i Fold2: preprocessor 1/1
## ✓ Fold2: preprocessor 1/1
## i Fold2: preprocessor 1/1, model 1/1
## ✓ Fold2: preprocessor 1/1, model 1/1
## i Fold2: preprocessor 1/1, model 1/1 (predictions)
## i Fold3: preprocessor 1/1
## ✓ Fold3: preprocessor 1/1
## i Fold3: preprocessor 1/1, model 1/1
## ✓ Fold3: preprocessor 1/1, model 1/1
## i Fold3: preprocessor 1/1, model 1/1 (predictions)
## i Fold4: preprocessor 1/1
## ✓ Fold4: preprocessor 1/1
## i Fold4: preprocessor 1/1, model 1/1
## ✓ Fold4: preprocessor 1/1, model 1/1
## i Fold4: preprocessor 1/1, model 1/1 (predictions)
## i Fold5: preprocessor 1/1
## ✓ Fold5: preprocessor 1/1
## i Fold5: preprocessor 1/1, model 1/1
## ✓ Fold5: preprocessor 1/1, model 1/1
## i Fold5: preprocessor 1/1, model 1/1 (predictions)
## ✓ Estimating performance
## ⓧ Newest results: roc_auc=0.9694 (+/-0.00175)
##
## ── Iteration 5 ─────────────────────────────────────────────────────────────────
##
## i Current best: roc_auc=0.9696 (@iter 0)
## i Gaussian process model
## ✓ Gaussian process model
## i Generating 1991 candidates
## i Predicted candidates
## i trees=460
## i Estimating performance
## i Fold1: preprocessor 1/1
## ✓ Fold1: preprocessor 1/1
## i Fold1: preprocessor 1/1, model 1/1
## ✓ Fold1: preprocessor 1/1, model 1/1
## i Fold1: preprocessor 1/1, model 1/1 (predictions)
## i Fold2: preprocessor 1/1
## ✓ Fold2: preprocessor 1/1
## i Fold2: preprocessor 1/1, model 1/1
## ✓ Fold2: preprocessor 1/1, model 1/1
## i Fold2: preprocessor 1/1, model 1/1 (predictions)
## i Fold3: preprocessor 1/1
## ✓ Fold3: preprocessor 1/1
## i Fold3: preprocessor 1/1, model 1/1
## ✓ Fold3: preprocessor 1/1, model 1/1
## i Fold3: preprocessor 1/1, model 1/1 (predictions)
## i Fold4: preprocessor 1/1
## ✓ Fold4: preprocessor 1/1
## i Fold4: preprocessor 1/1, model 1/1
## ✓ Fold4: preprocessor 1/1, model 1/1
## i Fold4: preprocessor 1/1, model 1/1 (predictions)
## i Fold5: preprocessor 1/1
## ✓ Fold5: preprocessor 1/1
## i Fold5: preprocessor 1/1, model 1/1
## ✓ Fold5: preprocessor 1/1, model 1/1
## i Fold5: preprocessor 1/1, model 1/1 (predictions)
## ✓ Estimating performance
## ⓧ Newest results: roc_auc=0.9695 (+/-0.00162)
## ! No improvement for 5 iterations; returning current results.
highest_rf_auc <- rf_search_res %>%
select_best("roc_auc")
highest_rf_aucrf_wflow <- finalize_workflow(
rf_wflow, highest_rf_auc
) %>%
fit(train) # score training
predict(rf_wflow, train, type="prob") %>%
bind_cols(predict(rf_wflow, train, type="class")) %>%
bind_cols(., train) -> scored_train_rf
# score testing
predict(rf_wflow, test, type="prob") %>%
bind_cols(predict(rf_wflow, test, type="class")) %>%
bind_cols(., test) -> scored_test_rfoptions(yardstick.event_first = FALSE)
# Variable Importance
rf_wflow %>%
extract_fit_parsnip() %>%
vi()rf_wflow %>%
extract_fit_parsnip() %>%
vip()scored_train_rf <- scored_train_rf %>% mutate(part = "training")
scored_test_rf <- scored_test_rf %>% mutate(part = "testing")
scored_train_rf %>% mutate(model = "train") %>%
bind_rows(scored_test_rf %>% mutate(model="test")) %>%
group_by(model) %>%
roc_curve(loan_status, .pred_default) %>%
autoplot() +
geom_vline(xintercept = 0.05, # 5% FPR
color = "red",
linetype = "longdash") +
geom_vline(xintercept = 0.25, # 25% FPR
color = "blue",
linetype = "longdash") +
geom_vline(xintercept = 0.75, # 75% FPR
color = "green",
linetype = "longdash") +
labs(title = "Random Forest ROC Curve" , x = "FPR(1 - specificity)", y = "TPR(recall)")scored_train_rf %>%
mutate(predict_class = as.factor(if_else(.pred_default >=0.5,'default','current'))) %>%
conf_mat(loan_status, estimate = predict_class) %>%
autoplot(type = "heatmap") +
labs(title="confusion matrix threshold >= 0.5")scored_test_rf %>%
mutate(predict_class = as.factor(if_else(.pred_default >=0.5,'default','current'))) %>%
conf_mat(loan_status, estimate = predict_class) %>%
autoplot(type = "heatmap") +
labs(title="confusion matrix threshold >= 0.5")scored_train_rf_mutate <- scored_train_rf %>%
mutate(predict_class = as.factor(if_else(.pred_default >=0.5,'default','current')))
scored_test_rf_mutate <- scored_test_rf %>%
mutate(predict_class = as.factor(if_else(.pred_default >=0.5,'default','current')))
bind_rows(scored_train_rf, scored_test_rf) %>%
group_by(part) %>%
dplyr::select(loan_status, .pred_class) -> train_test
precision <- train_test %>%
yardstick::precision(loan_status, .pred_class)
recall <- train_test %>%
yardstick::recall(loan_status, .pred_class)
scored_train_rf %>%
metrics(loan_status, .pred_default, estimate = .pred_class) %>%
bind_rows(scored_train_rf_mutate %>% yardstick::precision(loan_status, predict_class)) %>%
bind_rows(scored_train_rf_mutate %>% yardstick::recall(loan_status, predict_class)) %>%
filter(.metric %in% c("accuracy", "roc_auc", "mn_log_loss", "precision", "recall")) %>%
mutate(part="training") %>%
bind_rows(scored_test_rf %>%
metrics(loan_status, .pred_default, estimate = .pred_class) %>%
bind_rows(scored_train_rf_mutate %>% yardstick::precision(loan_status, predict_class)) %>%
bind_rows(scored_train_rf_mutate %>% yardstick::recall(loan_status, predict_class)) %>%
filter(.metric %in% c("accuracy", "roc_auc", "mn_log_loss", "precision", "recall")) %>%
mutate(part="testing")) %>%
bind_rows(
bind_cols(precision, recall) %>%
mutate(f1_score = 2*.estimate...4*.estimate...8/(.estimate...4 + .estimate...8)) %>%
mutate(.metric = "F1_Score") %>%
mutate(.estimator = "binary") %>%
dplyr::select(part...1, .metric, .estimator, f1_score) %>%
rename(c(".estimate" = "f1_score"))
) %>%
arrange(desc(part))# Score Distribution
scored_test_rf %>%
ggplot(aes(.pred_default,fill=loan_status)) +
geom_histogram(bins=50) +
geom_vline(aes(xintercept=.5, color="red")) +
geom_vline(aes(xintercept=.3, color="green")) +
geom_vline(aes(xintercept=.7, color="blue")) +
labs(title = "nnet 1") -> scored_dist
print(scored_dist)# operating range 0 - 10%
operating_range <- scored_test_rf %>%
roc_curve(loan_status, .pred_default) %>%
mutate(
fpr = round((1 - specificity), 3),
tpr = round(sensitivity, 3),
score_threshold = round(.threshold, 5)
) %>%
group_by(fpr) %>%
summarise(threshold = round(mean(score_threshold),4),
tpr = mean(tpr)) %>%
filter(fpr <= 0.5)
print(operating_range)## # A tibble: 501 × 3
## fpr threshold tpr
## <dbl> <dbl> <dbl>
## 1 0 Inf 0.0703
## 2 0.001 0.730 0.235
## 3 0.002 0.635 0.407
## 4 0.003 0.585 0.501
## 5 0.004 0.567 0.536
## 6 0.005 0.554 0.565
## 7 0.006 0.534 0.600
## 8 0.007 0.515 0.635
## 9 0.008 0.505 0.656
## 10 0.009 0.496 0.676
## # … with 491 more rows
# Precision Recall Chart
scored_test_rf %>%
pr_curve(loan_status, .pred_default) %>%
mutate(
recall = round(recall, 2),
.threshold = round(.threshold, 3),
precision = round(precision, 3)
) %>%
group_by(recall) %>%
summarise(precision = max(precision),
.threshold = min(.threshold))# create an explainer of a model
rf_explainer <- explain_tidymodels(
logistic_wf,
data = train ,
y = train$loan_default ,
verbose = TRUE
)## Preparation of a new explainer is initiated
## -> model label : workflow ( default )
## -> data : 20840 rows 32 cols
## -> data : tibble converted into a data.frame
## -> target variable : not specified! ( WARNING )
## -> predict function : yhat.workflow will be used ( default )
## -> predicted values : No value for predict function target column. ( default )
## -> model_info : package tidymodels , ver. 1.0.0 , task classification ( default )
## -> model_info : Model info detected classification task but 'y' is a NULL . ( WARNING )
## -> model_info : By deafult classification tasks supports only numercical 'y' parameter.
## -> model_info : Consider changing to numerical vector with 0 and 1 values.
## -> model_info : Otherwise I will not be able to calculate residuals or loss function.
## -> predicted values : numerical, min = 0.0000000000000002220446 , mean = 0.1518757 , max = 0.9999777
## -> residual function : difference between y and yhat ( default )
## A new explainer has been created!
rf_variable <- c('last_pymnt_amnt', 'last_pymnt_d_year', 'last_credit_pull_d_year', 'term', 'issue_d_year')
pdp <- function(m){
# create a profile of a single variable for a model
pdp_variable <- model_profile(
rf_explainer,
variables = as.character(m)
)
# Plot it
plot(pdp_variable) +
ggtitle(paste0("Partial Dependence Plot for ", as.character(m))) +
theme(axis.text.x=element_text(angle=45, hjust=1))
}
for (c in rf_variable){
print(pdp(c))
}Here we want to TUNE our XGB model using the Bayes method.
xgb_model <- boost_tree(trees = tune(),
learn_rate = tune(),
tree_depth = tune()) %>%
set_engine("xgboost",
importance="permutation") %>%
set_mode("classification")
xgb_wflow <- workflow() %>%
add_recipe(recipe) %>%
add_model(xgb_model)
xgb_search_res <- xgb_wflow %>%
tune_bayes(
resamples = kfold_splits,
# Generate five at semi-random to start
initial = 5,
iter = 50,
metrics = metric_set(yardstick::roc_auc),
control = control_bayes(no_improve = 5, verbose = TRUE)
)##
## ❯ Generating a set of 5 initial parameter results
## ✓ Initialization complete
##
##
## ── Iteration 1 ─────────────────────────────────────────────────────────────────
##
## i Current best: roc_auc=0.9798 (@iter 0)
## i Gaussian process model
## ✓ Gaussian process model
## i Generating 5000 candidates
## i Predicted candidates
## i trees=1668, tree_depth=8, learn_rate=0.306
## i Estimating performance
## i Fold1: preprocessor 1/1
## ✓ Fold1: preprocessor 1/1
## i Fold1: preprocessor 1/1, model 1/1
## ✓ Fold1: preprocessor 1/1, model 1/1
## i Fold1: preprocessor 1/1, model 1/1 (predictions)
## i Fold2: preprocessor 1/1
## ✓ Fold2: preprocessor 1/1
## i Fold2: preprocessor 1/1, model 1/1
## ✓ Fold2: preprocessor 1/1, model 1/1
## i Fold2: preprocessor 1/1, model 1/1 (predictions)
## i Fold3: preprocessor 1/1
## ✓ Fold3: preprocessor 1/1
## i Fold3: preprocessor 1/1, model 1/1
## ✓ Fold3: preprocessor 1/1, model 1/1
## i Fold3: preprocessor 1/1, model 1/1 (predictions)
## i Fold4: preprocessor 1/1
## ✓ Fold4: preprocessor 1/1
## i Fold4: preprocessor 1/1, model 1/1
## ✓ Fold4: preprocessor 1/1, model 1/1
## i Fold4: preprocessor 1/1, model 1/1 (predictions)
## i Fold5: preprocessor 1/1
## ✓ Fold5: preprocessor 1/1
## i Fold5: preprocessor 1/1, model 1/1
## ✓ Fold5: preprocessor 1/1, model 1/1
## i Fold5: preprocessor 1/1, model 1/1 (predictions)
## ✓ Estimating performance
## ⓧ Newest results: roc_auc=0.978 (+/-0.000737)
##
## ── Iteration 2 ─────────────────────────────────────────────────────────────────
##
## i Current best: roc_auc=0.9798 (@iter 0)
## i Gaussian process model
## ✓ Gaussian process model
## i Generating 5000 candidates
## i Predicted candidates
## i trees=1968, tree_depth=11, learn_rate=0.00122
## i Estimating performance
## i Fold1: preprocessor 1/1
## ✓ Fold1: preprocessor 1/1
## i Fold1: preprocessor 1/1, model 1/1
## ✓ Fold1: preprocessor 1/1, model 1/1
## i Fold1: preprocessor 1/1, model 1/1 (predictions)
## i Fold2: preprocessor 1/1
## ✓ Fold2: preprocessor 1/1
## i Fold2: preprocessor 1/1, model 1/1
## ✓ Fold2: preprocessor 1/1, model 1/1
## i Fold2: preprocessor 1/1, model 1/1 (predictions)
## i Fold3: preprocessor 1/1
## ✓ Fold3: preprocessor 1/1
## i Fold3: preprocessor 1/1, model 1/1
## ✓ Fold3: preprocessor 1/1, model 1/1
## i Fold3: preprocessor 1/1, model 1/1 (predictions)
## i Fold4: preprocessor 1/1
## ✓ Fold4: preprocessor 1/1
## i Fold4: preprocessor 1/1, model 1/1
## ✓ Fold4: preprocessor 1/1, model 1/1
## i Fold4: preprocessor 1/1, model 1/1 (predictions)
## i Fold5: preprocessor 1/1
## ✓ Fold5: preprocessor 1/1
## i Fold5: preprocessor 1/1, model 1/1
## ✓ Fold5: preprocessor 1/1, model 1/1
## i Fold5: preprocessor 1/1, model 1/1 (predictions)
## ✓ Estimating performance
## ⓧ Newest results: roc_auc=0.9769 (+/-0.00115)
##
## ── Iteration 3 ─────────────────────────────────────────────────────────────────
##
## i Current best: roc_auc=0.9798 (@iter 0)
## i Gaussian process model
## ✓ Gaussian process model
## i Generating 5000 candidates
## i Predicted candidates
## i trees=1426, tree_depth=4, learn_rate=0.00318
## i Estimating performance
## i Fold1: preprocessor 1/1
## ✓ Fold1: preprocessor 1/1
## i Fold1: preprocessor 1/1, model 1/1
## ✓ Fold1: preprocessor 1/1, model 1/1
## i Fold1: preprocessor 1/1, model 1/1 (predictions)
## i Fold2: preprocessor 1/1
## ✓ Fold2: preprocessor 1/1
## i Fold2: preprocessor 1/1, model 1/1
## ✓ Fold2: preprocessor 1/1, model 1/1
## i Fold2: preprocessor 1/1, model 1/1 (predictions)
## i Fold3: preprocessor 1/1
## ✓ Fold3: preprocessor 1/1
## i Fold3: preprocessor 1/1, model 1/1
## ✓ Fold3: preprocessor 1/1, model 1/1
## i Fold3: preprocessor 1/1, model 1/1 (predictions)
## i Fold4: preprocessor 1/1
## ✓ Fold4: preprocessor 1/1
## i Fold4: preprocessor 1/1, model 1/1
## ✓ Fold4: preprocessor 1/1, model 1/1
## i Fold4: preprocessor 1/1, model 1/1 (predictions)
## i Fold5: preprocessor 1/1
## ✓ Fold5: preprocessor 1/1
## i Fold5: preprocessor 1/1, model 1/1
## ✓ Fold5: preprocessor 1/1, model 1/1
## i Fold5: preprocessor 1/1, model 1/1 (predictions)
## ✓ Estimating performance
## ⓧ Newest results: roc_auc=0.9741 (+/-0.00133)
##
## ── Iteration 4 ─────────────────────────────────────────────────────────────────
##
## i Current best: roc_auc=0.9798 (@iter 0)
## i Gaussian process model
## ✓ Gaussian process model
## i Generating 5000 candidates
## i Predicted candidates
## i trees=1777, tree_depth=9, learn_rate=0.001
## i Estimating performance
## i Fold1: preprocessor 1/1
## ✓ Fold1: preprocessor 1/1
## i Fold1: preprocessor 1/1, model 1/1
## ✓ Fold1: preprocessor 1/1, model 1/1
## i Fold1: preprocessor 1/1, model 1/1 (predictions)
## i Fold2: preprocessor 1/1
## ✓ Fold2: preprocessor 1/1
## i Fold2: preprocessor 1/1, model 1/1
## ✓ Fold2: preprocessor 1/1, model 1/1
## i Fold2: preprocessor 1/1, model 1/1 (predictions)
## i Fold3: preprocessor 1/1
## ✓ Fold3: preprocessor 1/1
## i Fold3: preprocessor 1/1, model 1/1
## ✓ Fold3: preprocessor 1/1, model 1/1
## i Fold3: preprocessor 1/1, model 1/1 (predictions)
## i Fold4: preprocessor 1/1
## ✓ Fold4: preprocessor 1/1
## i Fold4: preprocessor 1/1, model 1/1
## ✓ Fold4: preprocessor 1/1, model 1/1
## i Fold4: preprocessor 1/1, model 1/1 (predictions)
## i Fold5: preprocessor 1/1
## ✓ Fold5: preprocessor 1/1
## i Fold5: preprocessor 1/1, model 1/1
## ✓ Fold5: preprocessor 1/1, model 1/1
## i Fold5: preprocessor 1/1, model 1/1 (predictions)
## ✓ Estimating performance
## ⓧ Newest results: roc_auc=0.9759 (+/-0.00107)
##
## ── Iteration 5 ─────────────────────────────────────────────────────────────────
##
## i Current best: roc_auc=0.9798 (@iter 0)
## i Gaussian process model
## ✓ Gaussian process model
## i Generating 5000 candidates
## i Predicted candidates
## i trees=1827, tree_depth=10, learn_rate=0.076
## i Estimating performance
## i Fold1: preprocessor 1/1
## ✓ Fold1: preprocessor 1/1
## i Fold1: preprocessor 1/1, model 1/1
## ✓ Fold1: preprocessor 1/1, model 1/1
## i Fold1: preprocessor 1/1, model 1/1 (predictions)
## i Fold2: preprocessor 1/1
## ✓ Fold2: preprocessor 1/1
## i Fold2: preprocessor 1/1, model 1/1
## ✓ Fold2: preprocessor 1/1, model 1/1
## i Fold2: preprocessor 1/1, model 1/1 (predictions)
## i Fold3: preprocessor 1/1
## ✓ Fold3: preprocessor 1/1
## i Fold3: preprocessor 1/1, model 1/1
## ✓ Fold3: preprocessor 1/1, model 1/1
## i Fold3: preprocessor 1/1, model 1/1 (predictions)
## i Fold4: preprocessor 1/1
## ✓ Fold4: preprocessor 1/1
## i Fold4: preprocessor 1/1, model 1/1
## ✓ Fold4: preprocessor 1/1, model 1/1
## i Fold4: preprocessor 1/1, model 1/1 (predictions)
## i Fold5: preprocessor 1/1
## ✓ Fold5: preprocessor 1/1
## i Fold5: preprocessor 1/1, model 1/1
## ✓ Fold5: preprocessor 1/1, model 1/1
## i Fold5: preprocessor 1/1, model 1/1 (predictions)
## ✓ Estimating performance
## ⓧ Newest results: roc_auc=0.9789 (+/-0.00114)
## ! No improvement for 5 iterations; returning current results.
highest_xgb_auc <- xgb_search_res %>%
select_best("roc_auc")
highest_xgb_aucxgb_wflow <- finalize_workflow(
xgb_wflow, highest_xgb_auc
) %>%
fit(train)## [20:11:19] WARNING: amalgamation/../src/learner.cc:627:
## Parameters: { "importance" } might not be used.
##
## This could be a false alarm, with some parameters getting used by language bindings but
## then being mistakenly passed down to XGBoost core, or some parameter actually being used
## but getting flagged wrongly here. Please open an issue if you find any such cases.
# -- score training
predict(xgb_wflow, train, type="prob") %>%
bind_cols(predict(xgb_wflow, train, type="class")) %>%
bind_cols(., train) -> scored_train_xgb
# -- score testing
predict(xgb_wflow, test, type="prob") %>%
bind_cols(predict(xgb_wflow, test, type="class")) %>%
bind_cols(., test) -> scored_test_xgb options(yardstick.event_first = FALSE)
# Variable Importance
xgb_wflow %>%
extract_fit_parsnip() %>%
vi()xgb_wflow %>%
extract_fit_parsnip() %>%
vip()scored_train_xgb <- scored_train_xgb %>% mutate(part = "training")
scored_test_xgb <- scored_test_xgb %>% mutate(part = "testing")
scored_train_xgb %>% mutate(model = "train") %>%
bind_rows(scored_test_xgb %>% mutate(model="test")) %>%
group_by(model) %>%
roc_curve(loan_status, .pred_default) %>%
autoplot() +
geom_vline(xintercept = 0.05, # 5% FPR
color = "red",
linetype = "longdash") +
geom_vline(xintercept = 0.25, # 25% FPR
color = "blue",
linetype = "longdash") +
geom_vline(xintercept = 0.75, # 75% FPR
color = "green",
linetype = "longdash") +
labs(title = "XGBoost ROC Curve" , x = "FPR(1 - specificity)", y = "TPR(recall)")scored_train_xgb %>%
mutate(predict_class = as.factor(if_else(.pred_default >=0.5,'default','current'))) %>%
conf_mat(loan_status, estimate = predict_class) %>%
autoplot(type = "heatmap") +
labs(title="confusion matrix threshold >= 0.5")scored_test_xgb %>%
mutate(predict_class = as.factor(if_else(.pred_default >=0.5,'default','current'))) %>%
conf_mat(loan_status, estimate = predict_class) %>%
autoplot(type = "heatmap") +
labs(title="confusion matrix threshold >= 0.5")scored_train_xgb_mutate <- scored_train_xgb %>%
mutate(predict_class = as.factor(if_else(.pred_default >=0.5,'default','current')))
scored_test_xgb_mutate <- scored_test_xgb %>%
mutate(predict_class = as.factor(if_else(.pred_default >=0.5,'default','current')))
bind_rows(scored_train_rf, scored_test_rf) %>%
group_by(part) %>%
dplyr::select(loan_status, .pred_class) -> train_test
precision <- train_test %>%
yardstick::precision(loan_status, .pred_class)
recall <- train_test %>%
yardstick::recall(loan_status, .pred_class)
scored_train_xgb %>%
metrics(loan_status, .pred_default, estimate = .pred_class) %>%
bind_rows(scored_train_rf_mutate %>% yardstick::precision(loan_status, predict_class)) %>%
bind_rows(scored_train_rf_mutate %>% yardstick::recall(loan_status, predict_class)) %>%
filter(.metric %in% c("accuracy", "roc_auc", "mn_log_loss", "precision", "recall")) %>%
mutate(part="training") %>%
bind_rows(scored_test_xgb %>%
metrics(loan_status, .pred_default, estimate = .pred_class) %>%
bind_rows(scored_train_rf_mutate %>% yardstick::precision(loan_status, predict_class)) %>%
bind_rows(scored_train_rf_mutate %>% yardstick::recall(loan_status, predict_class)) %>%
filter(.metric %in% c("accuracy", "roc_auc", "mn_log_loss", "precision", "recall")) %>%
mutate(part="testing")) %>%
bind_rows(
bind_cols(precision, recall) %>%
mutate(f1_score = 2*.estimate...4*.estimate...8/(.estimate...4 + .estimate...8)) %>%
mutate(.metric = "F1_Score") %>%
mutate(.estimator = "binary") %>%
dplyr::select(part...1, .metric, .estimator, f1_score) %>%
rename(c(".estimate" = "f1_score"))
) %>%
arrange(desc(part))# Score Distribution
scored_test_xgb %>%
ggplot(aes(.pred_default,fill=loan_status)) +
geom_histogram(bins=50) +
geom_vline(aes(xintercept=.5, color="red")) +
geom_vline(aes(xintercept=.3, color="green")) +
geom_vline(aes(xintercept=.7, color="blue")) +
labs(title = "nnet 1") -> scored_dist
print(scored_dist)# operating range 0 - 10%
operating_range <- scored_test_xgb %>%
roc_curve(loan_status, .pred_default) %>%
mutate(
fpr = round((1 - specificity), 3),
tpr = round(sensitivity, 3),
score_threshold = round(.threshold, 5)
) %>%
group_by(fpr) %>%
summarise(threshold = round(mean(score_threshold),4),
tpr = mean(tpr)) %>%
filter(fpr <= 0.5)
print(operating_range)## # A tibble: 501 × 3
## fpr threshold tpr
## <dbl> <dbl> <dbl>
## 1 0 Inf 0.157
## 2 0.001 0.998 0.406
## 3 0.002 0.994 0.556
## 4 0.003 0.987 0.635
## 5 0.004 0.980 0.684
## 6 0.005 0.968 0.727
## 7 0.006 0.948 0.763
## 8 0.007 0.928 0.784
## 9 0.008 0.910 0.803
## 10 0.009 0.873 0.825
## # … with 491 more rows
# Precision Recall Chart
scored_test_xgb %>%
pr_curve(loan_status, .pred_default) %>%
mutate(
recall = round(recall, 2),
.threshold = round(.threshold, 3),
precision = round(precision, 3)
) %>%
group_by(recall) %>%
summarise(precision = max(precision),
.threshold = min(.threshold))# create an explainer of a model
xgb_explainer <- explain_tidymodels(
logistic_wf,
data = train ,
y = train$loan_default ,
verbose = TRUE
)## Preparation of a new explainer is initiated
## -> model label : workflow ( default )
## -> data : 20840 rows 32 cols
## -> data : tibble converted into a data.frame
## -> target variable : not specified! ( WARNING )
## -> predict function : yhat.workflow will be used ( default )
## -> predicted values : No value for predict function target column. ( default )
## -> model_info : package tidymodels , ver. 1.0.0 , task classification ( default )
## -> model_info : Model info detected classification task but 'y' is a NULL . ( WARNING )
## -> model_info : By deafult classification tasks supports only numercical 'y' parameter.
## -> model_info : Consider changing to numerical vector with 0 and 1 values.
## -> model_info : Otherwise I will not be able to calculate residuals or loss function.
## -> predicted values : numerical, min = 0.0000000000000002220446 , mean = 0.1518757 , max = 0.9999777
## -> residual function : difference between y and yhat ( default )
## A new explainer has been created!
xgb_variable <- c('last_pymnt_amnt', 'last_pymnt_d_year', 'last_credit_pull_d_year', 'issue_d_year', 'loan_amnt')
pdp <- function(m){
# create a profile of a single variable for a model
pdp_variable <- model_profile(
xgb_explainer,
variables = as.character(m)
)
# Plot it
plot(pdp_variable) +
ggtitle(paste0("Partial Dependence Plot for ", as.character(m))) +
theme(axis.text.x=element_text(angle=45, hjust=1))
}
for (c in xgb_variable){
print(pdp(c))
}# your model variables of interest
model_variables = c('.pred_default', 'loan_amnt', 'annual_inc', 'inq_last_6mths', 'last_pymnt_amnt', 'term', 'int_rate', 'emp_length', 'purpose', 'issue_d_year', 'last_pymnt_d_year', 'grade', 'last_credit_pull_d_year', 'funded_amnt', 'fico_range_low', 'total_rec_late_fee')
# step 1. create explainer
xgb_explainer <-
explain_tidymodels(
xgb_wflow, # fitted workflow object
# data = train,
data = train %>% sample_n(1000), # original training data
y = train$loan_status, # predicted outcome
label = "randomforst",
verbose = FALSE
)
# step 2. get the record you want to predict
single_record <- scored_test_xgb %>% dplyr::select(model_variables) %>%
mutate(intercept = "", prediction = .pred_default) %>%
slice_max(order_by = .pred_default, n=10) %>% head(1)
# step 3. run the explainer
xgb_breakdown <- predict_parts(explainer = xgb_explainer,
new_observation = single_record
)
# step 4. plot it.
# you notice you don't get categorical values ...
xgb_breakdown %>% plot()# --- more involved explanations with categories. ----
# step 4a.. convert breakdown to a tibble so we can join it
xgb_breakdown %>%
as_tibble() -> breakdown_data
# step 4b. transpose your single record prediction
single_record %>%
gather(key="variable_name",value="value") -> prediction_data
# step 4c. get a predicted probability for plot
prediction_prob <- single_record[,".pred_default"] %>% pull()
# step 5. plot it.
breakdown_data %>%
inner_join(prediction_data) %>%
mutate(contribution = round(contribution,3),) %>%
filter(variable_name != "intercept") %>%
mutate(variable = paste(variable_name,value,sep = ": ")) %>%
ggplot(aes(y=reorder(variable, contribution), x= contribution, fill=sign)) +
geom_col() +
geom_text(aes(label=contribution),
size=4,
position=position_dodge(width=0.7),
vjust=0.5,
)+
labs(
title = "DALEX explainations",
subtitle = paste("predicted:",as.character(round(prediction_prob,3))),
x="contribution",
y="features")# step 3. run the explainer
xgb_shapley <- predict_parts(explainer = xgb_explainer,
new_observation = single_record,
type="shap")
# step 4. plot it.
# you notice you don't get categorical values ...
xgb_shapley %>% plot()# --- more involved explanations with categories. ----
# step 4a.. convert breakdown to a tibble so we can join it
xgb_shapley %>%
as_tibble() -> shap_data
# step 4b. transpose your single record prediction
single_record %>%
gather(key="variable_name",value="value") -> prediction_data
# step 4c. get a predicted probability for plot
prediction_prob <- single_record[,".pred_default"] %>% mutate(.pred_default = round(.pred_default,3)) %>% pull()
# step 5. plot it.
shap_data %>%
inner_join(prediction_data) %>%
mutate(variable = paste(variable_name,value,sep = ": ")) %>%
group_by(variable) %>%
summarize(contribution = mean(contribution)) %>%
mutate(contribution = round(contribution,3),
sign = if_else(contribution < 0, "neg","pos")) %>%
ggplot(aes(y=reorder(variable, contribution), x= contribution, fill=sign)) +
geom_col() +
geom_text(aes(label=contribution))+
labs(
title = "SHAPLEY explainations",
subtitle = paste("predicted probablity = ",prediction_prob) ,
x="contribution",
y="features")xgb_explainer <- explain_tidymodels(
rf_wflow, # fitted workflow object
data = train %>% sample_n(1000), # original training data
y = train$loan_status, # predicted outcome
label = "randomforest",
verbose = FALSE
)
explain_prediction <- function(single_record){
# step 3. run the explainer
record_shap <- predict_parts(explainer = xgb_explainer,
new_observation = single_record,
type="shap")
# step 4. plot it.
# you notice you don't get categorical values ...
record_shap %>% plot() %>% print()
# --- more involved explanations with categories. ----
# step 4a.. convert breakdown to a tibble so we can join it
record_shap %>%
as_tibble() -> shap_data
# step 4b. transpose your single record prediction
single_record %>%
gather(key="variable_name",value="value") -> prediction_data
# step 4c. get a predicted probability for plot
prediction_prob <- single_record[,".pred_default"] %>% mutate(.pred_default = round(.pred_default,3)) %>% pull()
# step 5. plot it.
shap_data %>%
inner_join(prediction_data) %>%
mutate(variable = paste(variable_name,value,sep = ": ")) %>%
group_by(variable) %>%
summarize(contribution = mean(contribution)) %>%
mutate(contribution = round(contribution,3),
sign = if_else(contribution < 0, "neg","pos")) %>%
ggplot(aes(y=reorder(variable, contribution), x= contribution, fill=sign)) +
geom_col() +
geom_text(aes(label=contribution))+
labs(
title = "SHAPLEY explainations",
subtitle = paste("predicted probablity = ",prediction_prob) ,
x="contribution",
y="features")
}
scored_test_xgb %>%
filter(.pred_class == 'default') %>%
filter(.pred_class == 'default') %>%
slice_max(.pred_default,n=10)scored_test_xgb %>%
filter(.pred_class == 'default') %>%
filter(loan_status != 'default' ) %>%
slice_max(.pred_default,n=10)scored_test_xgb %>%
filter(.pred_class != 'default') %>%
filter(loan_status == 'default' ) %>%
slice_min(.pred_default,n=10)top_5_tp <- scored_test_xgb %>%
filter(.pred_class == 'default') %>%
filter(.pred_class == 'default') %>%
slice_max(.pred_default,n=5)
top_5_fp <- scored_test_xgb %>%
filter(.pred_class == 'default') %>%
filter(loan_status != 'default' ) %>%
slice_max(.pred_default,n=5)
top_5_fn <- scored_test_xgb %>%
filter(.pred_class != 'default') %>%
filter(loan_status == 'default' ) %>%
slice_min(.pred_default,n=5)
# repeat for TP, FP and FN
for (row in 1:nrow(top_5_tp)) {
s_record <- top_5_tp[row,]
explain_prediction(s_record)
} for (row in 1:nrow(top_5_fp)) {
s_record <- top_5_tp[row,]
explain_prediction(s_record)
} for (row in 1:nrow(top_5_fn)) {
s_record <- top_5_tp[row,]
explain_prediction(s_record)
} kaggle_prediction <- predict(rf_wflow, loan_kaggle_decimal, type = "prob") %>%
bind_cols(predict(rf_wflow, loan_kaggle_decimal, type = "class")) %>%
bind_cols(loan_kaggle_decimal) %>%
dplyr:::select.data.frame(id, loan_status = .pred_default)
head(kaggle_prediction) kaggle_prediction %>% write_csv("Eagle Xuhui Ying_prediction.csv")